Index: head/lib/libstand/ext2fs.c =================================================================== --- head/lib/libstand/ext2fs.c (revision 313779) +++ head/lib/libstand/ext2fs.c (revision 313780) @@ -1,908 +1,908 @@ /*- * Copyright (c) 1999,2000 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * * Copyright (c) 1990, 1991 Carnegie Mellon University * All Rights Reserved. * * Author: David Golub * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include #include #include "stand.h" #include "string.h" static int ext2fs_open(const char *path, struct open_file *f); static int ext2fs_close(struct open_file *f); static int ext2fs_read(struct open_file *f, void *buf, size_t size, size_t *resid); static off_t ext2fs_seek(struct open_file *f, off_t offset, int where); static int ext2fs_stat(struct open_file *f, struct stat *sb); static int ext2fs_readdir(struct open_file *f, struct dirent *d); static int dtmap[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; #define EXTFTODT(x) (x) > sizeof(dtmap) / sizeof(dtmap[0]) ? \ DT_UNKNOWN : dtmap[x] struct fs_ops ext2fs_fsops = { "ext2fs", ext2fs_open, ext2fs_close, ext2fs_read, null_write, ext2fs_seek, ext2fs_stat, ext2fs_readdir }; #define EXT2_SBSIZE 1024 #define EXT2_SBLOCK (1024 / DEV_BSIZE) /* block offset of superblock */ #define EXT2_MAGIC 0xef53 #define EXT2_ROOTINO 2 #define EXT2_REV0 0 /* original revision of ext2 */ #define EXT2_R0_ISIZE 128 /* inode size */ #define EXT2_R0_FIRSTINO 11 /* first inode */ #define EXT2_MINBSHIFT 10 /* mininum block shift */ #define EXT2_MINFSHIFT 10 /* mininum frag shift */ -#define NDADDR 12 /* # of direct blocks */ -#define NIADDR 3 /* # of indirect blocks */ +#define EXT2_NDADDR 12 /* # of direct blocks */ +#define EXT2_NIADDR 3 /* # of indirect blocks */ /* * file system block to disk address */ #define fsb_to_db(fs, blk) ((blk) << (fs)->fs_fsbtodb) /* * inode to block group offset * inode to block group * inode to disk address * inode to block offset */ #define ino_to_bgo(fs, ino) (((ino) - 1) % (fs)->fs_ipg) #define ino_to_bg(fs, ino) (((ino) - 1) / (fs)->fs_ipg) #define ino_to_db(fs, bg, ino) \ fsb_to_db(fs, ((bg)[ino_to_bg(fs, ino)].bg_inotbl + \ ino_to_bgo(fs, ino) / (fs)->fs_ipb)) #define ino_to_bo(fs, ino) (ino_to_bgo(fs, ino) % (fs)->fs_ipb) #define nindir(fs) \ ((fs)->fs_bsize / sizeof(u_int32_t)) #define lblkno(fs, loc) /* loc / bsize */ \ ((loc) >> (fs)->fs_bshift) #define smalllblktosize(fs, blk) /* blk * bsize */ \ ((blk) << (fs)->fs_bshift) #define blkoff(fs, loc) /* loc % bsize */ \ ((loc) & (fs)->fs_bmask) #define fragroundup(fs, size) /* roundup(size, fsize) */ \ (((size) + (fs)->fs_fmask) & ~(fs)->fs_fmask) #define dblksize(fs, dip, lbn) \ - (((lbn) >= NDADDR || (dip)->di_size >= smalllblktosize(fs, (lbn) + 1)) \ + (((lbn) >= EXT2_NDADDR || (dip)->di_size >= smalllblktosize(fs, (lbn) + 1)) \ ? 
(fs)->fs_bsize \ : (fragroundup(fs, blkoff(fs, (dip)->di_size)))) /* * superblock describing ext2fs */ struct ext2fs_disk { u_int32_t fd_inodes; /* # of inodes */ u_int32_t fd_blocks; /* # of blocks */ u_int32_t fd_resblk; /* # of reserved blocks */ u_int32_t fd_freeblk; /* # of free blocks */ u_int32_t fd_freeino; /* # of free inodes */ u_int32_t fd_firstblk; /* first data block */ u_int32_t fd_bsize; /* block size */ u_int32_t fd_fsize; /* frag size */ u_int32_t fd_bpg; /* blocks per group */ u_int32_t fd_fpg; /* frags per group */ u_int32_t fd_ipg; /* inodes per group */ u_int32_t fd_mtime; /* mount time */ u_int32_t fd_wtime; /* write time */ u_int16_t fd_mount; /* # of mounts */ int16_t fd_maxmount; /* max # of mounts */ u_int16_t fd_magic; /* magic number */ u_int16_t fd_state; /* state */ u_int16_t fd_eflag; /* error flags */ u_int16_t fd_mnrrev; /* minor revision */ u_int32_t fd_lastchk; /* last check */ u_int32_t fd_chkintvl; /* maximum check interval */ u_int32_t fd_os; /* os */ u_int32_t fd_revision; /* revision */ u_int16_t fd_uid; /* uid for reserved blocks */ u_int16_t fd_gid; /* gid for reserved blocks */ u_int32_t fd_firstino; /* first non-reserved inode */ u_int16_t fd_isize; /* inode size */ u_int16_t fd_nblkgrp; /* block group # of superblock */ u_int32_t fd_fcompat; /* compatible features */ u_int32_t fd_fincompat; /* incompatible features */ u_int32_t fd_frocompat; /* read-only compatibilties */ u_int8_t fd_uuid[16]; /* volume uuid */ char fd_volname[16]; /* volume name */ char fd_fsmnt[64]; /* name last mounted on */ u_int32_t fd_bitmap; /* compression bitmap */ u_int8_t fd_nblkpa; /* # of blocks to preallocate */ u_int8_t fd_ndblkpa; /* # of dir blocks to preallocate */ }; struct ext2fs_core { int fc_bsize; /* block size */ int fc_bshift; /* block shift amount */ int fc_bmask; /* block mask */ int fc_fsize; /* frag size */ int fc_fshift; /* frag shift amount */ int fc_fmask; /* frag mask */ int fc_isize; /* inode size */ int fc_imask; /* inode mask */ int fc_firstino; /* first non-reserved inode */ int fc_ipb; /* inodes per block */ int fc_fsbtodb; /* fsb to ds shift */ }; struct ext2fs { struct ext2fs_disk fs_fd; char fs_pad[EXT2_SBSIZE - sizeof(struct ext2fs_disk)]; struct ext2fs_core fs_fc; #define fs_magic fs_fd.fd_magic #define fs_revision fs_fd.fd_revision #define fs_blocks fs_fd.fd_blocks #define fs_firstblk fs_fd.fd_firstblk #define fs_bpg fs_fd.fd_bpg #define fs_ipg fs_fd.fd_ipg #define fs_bsize fs_fc.fc_bsize #define fs_bshift fs_fc.fc_bshift #define fs_bmask fs_fc.fc_bmask #define fs_fsize fs_fc.fc_fsize #define fs_fshift fs_fc.fc_fshift #define fs_fmask fs_fc.fc_fmask #define fs_isize fs_fc.fc_isize #define fs_imask fs_fc.fc_imask #define fs_firstino fs_fc.fc_firstino #define fs_ipb fs_fc.fc_ipb #define fs_fsbtodb fs_fc.fc_fsbtodb }; struct ext2blkgrp { u_int32_t bg_blkmap; /* block bitmap */ u_int32_t bg_inomap; /* inode bitmap */ u_int32_t bg_inotbl; /* inode table */ u_int16_t bg_nfblk; /* # of free blocks */ u_int16_t bg_nfino; /* # of free inodes */ u_int16_t bg_ndirs; /* # of dirs */ char bg_pad[14]; }; struct ext2dinode { u_int16_t di_mode; /* mode */ u_int16_t di_uid; /* uid */ u_int32_t di_size; /* byte size */ u_int32_t di_atime; /* access time */ u_int32_t di_ctime; /* creation time */ u_int32_t di_mtime; /* modification time */ u_int32_t di_dtime; /* deletion time */ u_int16_t di_gid; /* gid */ u_int16_t di_nlink; /* link count */ u_int32_t di_nblk; /* block count */ u_int32_t di_flags; /* file flags */ u_int32_t di_osdep1; /* os dependent 
stuff */ - u_int32_t di_db[NDADDR]; /* direct blocks */ - u_int32_t di_ib[NIADDR]; /* indirect blocks */ + u_int32_t di_db[EXT2_NDADDR]; /* direct blocks */ + u_int32_t di_ib[EXT2_NIADDR]; /* indirect blocks */ u_int32_t di_version; /* version */ u_int32_t di_facl; /* file acl */ u_int32_t di_dacl; /* dir acl */ u_int32_t di_faddr; /* fragment addr */ u_int8_t di_frag; /* fragment number */ u_int8_t di_fsize; /* fragment size */ char di_pad[10]; #define di_shortlink di_db }; #define EXT2_MAXNAMLEN 255 struct ext2dirent { u_int32_t d_ino; /* inode */ u_int16_t d_reclen; /* directory entry length */ u_int8_t d_namlen; /* name length */ u_int8_t d_type; /* file type */ char d_name[EXT2_MAXNAMLEN]; }; struct file { off_t f_seekp; /* seek pointer */ struct ext2fs *f_fs; /* pointer to super-block */ struct ext2blkgrp *f_bg; /* pointer to blkgrp map */ struct ext2dinode f_di; /* copy of on-disk inode */ - int f_nindir[NIADDR]; /* number of blocks mapped by + int f_nindir[EXT2_NIADDR]; /* number of blocks mapped by indirect block at level i */ - char *f_blk[NIADDR]; /* buffer for indirect block + char *f_blk[EXT2_NIADDR]; /* buffer for indirect block at level i */ - size_t f_blksize[NIADDR]; /* size of buffer */ - daddr_t f_blkno[NIADDR]; /* disk address of block in + size_t f_blksize[EXT2_NIADDR]; /* size of buffer */ + daddr_t f_blkno[EXT2_NIADDR]; /* disk address of block in buffer */ char *f_buf; /* buffer for data block */ size_t f_buf_size; /* size of data block */ daddr_t f_buf_blkno; /* block number of data block */ }; /* forward decls */ static int read_inode(ino_t inumber, struct open_file *f); static int block_map(struct open_file *f, daddr_t file_block, daddr_t *disk_block_p); static int buf_read_file(struct open_file *f, char **buf_p, size_t *size_p); static int search_directory(char *name, struct open_file *f, ino_t *inumber_p); /* * Open a file. 
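 *
 * The path is resolved one component at a time starting from
 * EXT2_ROOTINO: each component is looked up in the current directory,
 * its inode is read in, and a symbolic link restarts the walk at the
 * parent directory (relative target) or at the root (absolute target),
 * bounded by MAXSYMLINKS.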
*/ static int ext2fs_open(const char *upath, struct open_file *f) { struct file *fp; struct ext2fs *fs; size_t buf_size; ino_t inumber, parent_inumber; int i, len, groups, bg_per_blk, blkgrps, mult; int nlinks = 0; int error = 0; char *cp, *ncp, *path = NULL, *buf = NULL; char namebuf[MAXPATHLEN+1]; char c; /* allocate file system specific data structure */ fp = malloc(sizeof(struct file)); if (fp == NULL) return (ENOMEM); bzero(fp, sizeof(struct file)); f->f_fsdata = (void *)fp; /* allocate space and read super block */ fs = (struct ext2fs *)malloc(sizeof(*fs)); fp->f_fs = fs; twiddle(1); error = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, EXT2_SBLOCK, EXT2_SBSIZE, (char *)fs, &buf_size); if (error) goto out; if (buf_size != EXT2_SBSIZE || fs->fs_magic != EXT2_MAGIC) { error = EINVAL; goto out; } /* * compute in-core values for the superblock */ fs->fs_bshift = EXT2_MINBSHIFT + fs->fs_fd.fd_bsize; fs->fs_bsize = 1 << fs->fs_bshift; fs->fs_bmask = fs->fs_bsize - 1; fs->fs_fshift = EXT2_MINFSHIFT + fs->fs_fd.fd_fsize; fs->fs_fsize = 1 << fs->fs_fshift; fs->fs_fmask = fs->fs_fsize - 1; if (fs->fs_revision == EXT2_REV0) { fs->fs_isize = EXT2_R0_ISIZE; fs->fs_firstino = EXT2_R0_FIRSTINO; } else { fs->fs_isize = fs->fs_fd.fd_isize; fs->fs_firstino = fs->fs_fd.fd_firstino; } fs->fs_imask = fs->fs_isize - 1; fs->fs_ipb = fs->fs_bsize / fs->fs_isize; fs->fs_fsbtodb = (fs->fs_bsize / DEV_BSIZE) - 1; /* * we have to load in the "group descriptors" here */ groups = howmany(fs->fs_blocks - fs->fs_firstblk, fs->fs_bpg); bg_per_blk = fs->fs_bsize / sizeof(struct ext2blkgrp); blkgrps = howmany(groups, bg_per_blk); len = blkgrps * fs->fs_bsize; fp->f_bg = malloc(len); twiddle(1); error = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, EXT2_SBLOCK + EXT2_SBSIZE / DEV_BSIZE, len, (char *)fp->f_bg, &buf_size); if (error) goto out; /* * XXX * validation of values? (blocksize, descriptors, etc?) */ /* * Calculate indirect block levels. */ mult = 1; - for (i = 0; i < NIADDR; i++) { + for (i = 0; i < EXT2_NIADDR; i++) { mult *= nindir(fs); fp->f_nindir[i] = mult; } inumber = EXT2_ROOTINO; if ((error = read_inode(inumber, f)) != 0) goto out; path = strdup(upath); if (path == NULL) { error = ENOMEM; goto out; } cp = path; while (*cp) { /* * Remove extra separators */ while (*cp == '/') cp++; if (*cp == '\0') break; /* * Check that current node is a directory. */ if (! S_ISDIR(fp->f_di.di_mode)) { error = ENOTDIR; goto out; } /* * Get next component of path name. */ len = 0; ncp = cp; while ((c = *cp) != '\0' && c != '/') { if (++len > EXT2_MAXNAMLEN) { error = ENOENT; goto out; } cp++; } *cp = '\0'; /* * Look up component in current directory. * Save directory inumber in case we find a * symbolic link. */ parent_inumber = inumber; error = search_directory(ncp, f, &inumber); *cp = c; if (error) goto out; /* * Open next component. */ if ((error = read_inode(inumber, f)) != 0) goto out; /* * Check for symbolic link. */ if (S_ISLNK(fp->f_di.di_mode)) { int link_len = fp->f_di.di_size; int len; len = strlen(cp); if (link_len + len > MAXPATHLEN || ++nlinks > MAXSYMLINKS) { error = ENOENT; goto out; } bcopy(cp, &namebuf[link_len], len + 1); if (fp->f_di.di_nblk == 0) { bcopy(fp->f_di.di_shortlink, namebuf, link_len); } else { /* * Read file for symbolic link */ struct ext2fs *fs = fp->f_fs; daddr_t disk_block; size_t buf_size; if (! 
buf) buf = malloc(fs->fs_bsize); error = block_map(f, (daddr_t)0, &disk_block); if (error) goto out; twiddle(1); error = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsb_to_db(fs, disk_block), fs->fs_bsize, buf, &buf_size); if (error) goto out; bcopy((char *)buf, namebuf, link_len); } /* * If relative pathname, restart at parent directory. * If absolute pathname, restart at root. */ cp = namebuf; if (*cp != '/') inumber = parent_inumber; else inumber = (ino_t)EXT2_ROOTINO; if ((error = read_inode(inumber, f)) != 0) goto out; } } /* * Found terminal component. */ error = 0; fp->f_seekp = 0; out: if (buf) free(buf); if (path) free(path); if (error) { if (fp->f_buf) free(fp->f_buf); free(fp->f_fs); free(fp); } return (error); } /* * Read a new inode into a file structure. */ static int read_inode(ino_t inumber, struct open_file *f) { struct file *fp = (struct file *)f->f_fsdata; struct ext2fs *fs = fp->f_fs; struct ext2dinode *dp; char *buf; size_t rsize; int level, error = 0; /* * Read inode and save it. */ buf = malloc(fs->fs_bsize); twiddle(1); error = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, ino_to_db(fs, fp->f_bg, inumber), fs->fs_bsize, buf, &rsize); if (error) goto out; if (rsize != fs->fs_bsize) { error = EIO; goto out; } dp = (struct ext2dinode *)buf; fp->f_di = dp[ino_to_bo(fs, inumber)]; /* clear out old buffers */ - for (level = 0; level < NIADDR; level++) + for (level = 0; level < EXT2_NIADDR; level++) fp->f_blkno[level] = -1; fp->f_buf_blkno = -1; fp->f_seekp = 0; out: free(buf); return (error); } /* * Given an offset in a file, find the disk block number that * contains that block. */ static int block_map(struct open_file *f, daddr_t file_block, daddr_t *disk_block_p) { struct file *fp = (struct file *)f->f_fsdata; struct ext2fs *fs = fp->f_fs; daddr_t ind_block_num; int32_t *ind_p; int idx, level; int error; /* * Index structure of an inode: * - * di_db[0..NDADDR-1] hold block numbers for blocks - * 0..NDADDR-1 + * di_db[0..EXT2_NDADDR-1] hold block numbers for blocks + * 0..EXT2_NDADDR-1 * * di_ib[0] index block 0 is the single indirect block * holds block numbers for blocks - * NDADDR .. NDADDR + NINDIR(fs)-1 + * EXT2_NDADDR .. EXT2_NDADDR + NINDIR(fs)-1 * * di_ib[1] index block 1 is the double indirect block * holds block numbers for INDEX blocks for blocks - * NDADDR + NINDIR(fs) .. - * NDADDR + NINDIR(fs) + NINDIR(fs)**2 - 1 + * EXT2_NDADDR + NINDIR(fs) .. + * EXT2_NDADDR + NINDIR(fs) + NINDIR(fs)**2 - 1 * * di_ib[2] index block 2 is the triple indirect block * holds block numbers for double-indirect * blocks for blocks - * NDADDR + NINDIR(fs) + NINDIR(fs)**2 .. - * NDADDR + NINDIR(fs) + NINDIR(fs)**2 + * EXT2_NDADDR + NINDIR(fs) + NINDIR(fs)**2 .. + * EXT2_NDADDR + NINDIR(fs) + NINDIR(fs)**2 * + NINDIR(fs)**3 - 1 */ - if (file_block < NDADDR) { + if (file_block < EXT2_NDADDR) { /* Direct block. 
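The logical block is one of the
 * EXT2_NDADDR entries stored directly in the inode.  Worked example
 * (illustrative numbers only): with a 1024-byte block, nindir(fs) is
 * 1024 / sizeof(u_int32_t) = 256, so the direct blocks cover logical
 * blocks 0..11, the single indirect 12..267, the double indirect
 * 268..65803, and the triple indirect 65804..16843019.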
*/ *disk_block_p = fp->f_di.di_db[file_block]; return (0); } - file_block -= NDADDR; + file_block -= EXT2_NDADDR; /* * nindir[0] = NINDIR * nindir[1] = NINDIR**2 * nindir[2] = NINDIR**3 * etc */ - for (level = 0; level < NIADDR; level++) { + for (level = 0; level < EXT2_NIADDR; level++) { if (file_block < fp->f_nindir[level]) break; file_block -= fp->f_nindir[level]; } - if (level == NIADDR) { + if (level == EXT2_NIADDR) { /* Block number too high */ return (EFBIG); } ind_block_num = fp->f_di.di_ib[level]; for (; level >= 0; level--) { if (ind_block_num == 0) { *disk_block_p = 0; /* missing */ return (0); } if (fp->f_blkno[level] != ind_block_num) { if (fp->f_blk[level] == (char *)0) fp->f_blk[level] = malloc(fs->fs_bsize); twiddle(1); error = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsb_to_db(fp->f_fs, ind_block_num), fs->fs_bsize, fp->f_blk[level], &fp->f_blksize[level]); if (error) return (error); if (fp->f_blksize[level] != fs->fs_bsize) return (EIO); fp->f_blkno[level] = ind_block_num; } ind_p = (int32_t *)fp->f_blk[level]; if (level > 0) { idx = file_block / fp->f_nindir[level - 1]; file_block %= fp->f_nindir[level - 1]; } else { idx = file_block; } ind_block_num = ind_p[idx]; } *disk_block_p = ind_block_num; return (0); } /* * Read a portion of a file into an internal buffer. Return * the location in the buffer and the amount in the buffer. */ static int buf_read_file(struct open_file *f, char **buf_p, size_t *size_p) { struct file *fp = (struct file *)f->f_fsdata; struct ext2fs *fs = fp->f_fs; long off; daddr_t file_block; daddr_t disk_block; size_t block_size; int error = 0; off = blkoff(fs, fp->f_seekp); file_block = lblkno(fs, fp->f_seekp); block_size = dblksize(fs, &fp->f_di, file_block); if (file_block != fp->f_buf_blkno) { error = block_map(f, file_block, &disk_block); if (error) goto done; if (fp->f_buf == (char *)0) fp->f_buf = malloc(fs->fs_bsize); if (disk_block == 0) { bzero(fp->f_buf, block_size); fp->f_buf_size = block_size; } else { twiddle(4); error = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsb_to_db(fs, disk_block), block_size, fp->f_buf, &fp->f_buf_size); if (error) goto done; } fp->f_buf_blkno = file_block; } /* * Return address of byte in buffer corresponding to * offset, and size of remainder of buffer after that * byte. */ *buf_p = fp->f_buf + off; *size_p = block_size - off; /* * But truncate buffer at end of file. */ if (*size_p > fp->f_di.di_size - fp->f_seekp) *size_p = fp->f_di.di_size - fp->f_seekp; done: return (error); } /* * Search a directory for a name and return its * i_number. 
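 *
 * On-disk ext2 entries are variable-length records (d_ino, d_reclen,
 * d_namlen, d_type, then the name), so the scan advances by d_reclen
 * and a d_ino of zero marks a deleted slot that still occupies space.
 * Below is a minimal sketch of one block's worth of that walk;
 * example_scan_block is not part of this file and assumes only
 * struct ext2dirent as declared above.
 */

static int
example_scan_block(char *buf, size_t buf_size, const char *name,
    u_int32_t *ino_p)
{
	struct ext2dirent *dp = (struct ext2dirent *)buf;
	struct ext2dirent *edp = (struct ext2dirent *)(buf + buf_size);
	size_t len = strlen(name);

	while (dp < edp) {
		/* d_ino == 0 is a hole left by a deleted entry */
		if (dp->d_ino != 0 && dp->d_namlen == len &&
		    strncmp(name, dp->d_name, len) == 0) {
			*ino_p = dp->d_ino;
			return (0);
		}
		dp = (struct ext2dirent *)((char *)dp + dp->d_reclen);
	}
	return (ENOENT);
}

/*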
*/ static int search_directory(char *name, struct open_file *f, ino_t *inumber_p) { struct file *fp = (struct file *)f->f_fsdata; struct ext2dirent *dp, *edp; char *buf; size_t buf_size; int namlen, length; int error; length = strlen(name); fp->f_seekp = 0; while (fp->f_seekp < fp->f_di.di_size) { error = buf_read_file(f, &buf, &buf_size); if (error) return (error); dp = (struct ext2dirent *)buf; edp = (struct ext2dirent *)(buf + buf_size); while (dp < edp) { if (dp->d_ino == (ino_t)0) goto next; namlen = dp->d_namlen; if (namlen == length && strncmp(name, dp->d_name, length) == 0) { /* found entry */ *inumber_p = dp->d_ino; return (0); } next: dp = (struct ext2dirent *)((char *)dp + dp->d_reclen); } fp->f_seekp += buf_size; } return (ENOENT); } static int ext2fs_close(struct open_file *f) { struct file *fp = (struct file *)f->f_fsdata; int level; f->f_fsdata = (void *)0; if (fp == (struct file *)0) return (0); - for (level = 0; level < NIADDR; level++) { + for (level = 0; level < EXT2_NIADDR; level++) { if (fp->f_blk[level]) free(fp->f_blk[level]); } if (fp->f_buf) free(fp->f_buf); if (fp->f_bg) free(fp->f_bg); free(fp->f_fs); free(fp); return (0); } static int ext2fs_read(struct open_file *f, void *addr, size_t size, size_t *resid) { struct file *fp = (struct file *)f->f_fsdata; size_t csize, buf_size; char *buf; int error = 0; while (size != 0) { if (fp->f_seekp >= fp->f_di.di_size) break; error = buf_read_file(f, &buf, &buf_size); if (error) break; csize = size; if (csize > buf_size) csize = buf_size; bcopy(buf, addr, csize); fp->f_seekp += csize; addr = (char *)addr + csize; size -= csize; } if (resid) *resid = size; return (error); } static off_t ext2fs_seek(struct open_file *f, off_t offset, int where) { struct file *fp = (struct file *)f->f_fsdata; switch (where) { case SEEK_SET: fp->f_seekp = offset; break; case SEEK_CUR: fp->f_seekp += offset; break; case SEEK_END: fp->f_seekp = fp->f_di.di_size - offset; break; default: errno = EINVAL; return (-1); } return (fp->f_seekp); } static int ext2fs_stat(struct open_file *f, struct stat *sb) { struct file *fp = (struct file *)f->f_fsdata; /* only important stuff */ sb->st_mode = fp->f_di.di_mode; sb->st_uid = fp->f_di.di_uid; sb->st_gid = fp->f_di.di_gid; sb->st_size = fp->f_di.di_size; return (0); } static int ext2fs_readdir(struct open_file *f, struct dirent *d) { struct file *fp = (struct file *)f->f_fsdata; struct ext2dirent *ed; char *buf; size_t buf_size; int error; /* * assume that a directory entry will not be split across blocks */ again: if (fp->f_seekp >= fp->f_di.di_size) return (ENOENT); error = buf_read_file(f, &buf, &buf_size); if (error) return (error); ed = (struct ext2dirent *)buf; fp->f_seekp += ed->d_reclen; if (ed->d_ino == (ino_t)0) goto again; d->d_type = EXTFTODT(ed->d_type); strncpy(d->d_name, ed->d_name, ed->d_namlen); d->d_name[ed->d_namlen] = '\0'; return (0); } Index: head/lib/libstand/nandfs.c =================================================================== --- head/lib/libstand/nandfs.c (revision 313779) +++ head/lib/libstand/nandfs.c (revision 313780) @@ -1,1061 +1,1061 @@ /*- * Copyright (c) 2010-2012 Semihalf. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "stand.h" #include "string.h" #include "zlib.h" #define DEBUG #undef DEBUG #ifdef DEBUG #define NANDFS_DEBUG(fmt, args...) do { \ printf("NANDFS_DEBUG:" fmt "\n", ##args); } while (0) #else #define NANDFS_DEBUG(fmt, args...) #endif struct nandfs_mdt { uint32_t entries_per_block; uint32_t entries_per_group; uint32_t blocks_per_group; uint32_t groups_per_desc_block; /* desc is super group */ uint32_t blocks_per_desc_block; /* desc is super group */ }; struct bmap_buf { LIST_ENTRY(bmap_buf) list; nandfs_daddr_t blknr; uint64_t *map; }; struct nandfs_node { struct nandfs_inode *inode; LIST_HEAD(, bmap_buf) bmap_bufs; }; struct nandfs { int nf_blocksize; int nf_sectorsize; int nf_cpno; struct open_file *nf_file; struct nandfs_node *nf_opened_node; u_int nf_offset; uint8_t *nf_buf; int64_t nf_buf_blknr; struct nandfs_fsdata *nf_fsdata; struct nandfs_super_block *nf_sb; struct nandfs_segment_summary nf_segsum; struct nandfs_checkpoint nf_checkpoint; struct nandfs_super_root nf_sroot; struct nandfs_node nf_ifile; struct nandfs_node nf_datfile; struct nandfs_node nf_cpfile; struct nandfs_mdt nf_datfile_mdt; struct nandfs_mdt nf_ifile_mdt; - int nf_nindir[NIADDR]; + int nf_nindir[NANDFS_NIADDR]; }; static int nandfs_open(const char *, struct open_file *); static int nandfs_close(struct open_file *); static int nandfs_read(struct open_file *, void *, size_t, size_t *); static off_t nandfs_seek(struct open_file *, off_t, int); static int nandfs_stat(struct open_file *, struct stat *); static int nandfs_readdir(struct open_file *, struct dirent *); static int nandfs_buf_read(struct nandfs *, void **, size_t *); static struct nandfs_node *nandfs_lookup_path(struct nandfs *, const char *); static int nandfs_read_inode(struct nandfs *, struct nandfs_node *, nandfs_lbn_t, u_int, void *, int); static int nandfs_read_blk(struct nandfs *, nandfs_daddr_t, void *, int); static int nandfs_bmap_lookup(struct nandfs *, struct nandfs_node *, nandfs_lbn_t, nandfs_daddr_t *, int); static int nandfs_get_checkpoint(struct nandfs *, uint64_t, struct nandfs_checkpoint *); static nandfs_daddr_t nandfs_vtop(struct nandfs *, nandfs_daddr_t); static void nandfs_calc_mdt_consts(int, struct nandfs_mdt *, int); static void nandfs_mdt_trans(struct nandfs_mdt *, uint64_t, nandfs_daddr_t *, uint32_t *); static int ioread(struct open_file *, off_t, void *, u_int); static int nandfs_probe_sectorsize(struct open_file *); struct fs_ops nandfs_fsops = { "nandfs", nandfs_open, nandfs_close, 
nandfs_read, null_write, nandfs_seek, nandfs_stat, nandfs_readdir }; #define NINDIR(fs) ((fs)->nf_blocksize / sizeof(nandfs_daddr_t)) /* from NetBSD's src/sys/net/if_ethersubr.c */ static uint32_t nandfs_crc32(uint32_t crc, const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; crc = crc ^ ~0U; for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc ^ ~0U); } static int nandfs_check_fsdata_crc(struct nandfs_fsdata *fsdata) { uint32_t fsdata_crc, comp_crc; if (fsdata->f_magic != NANDFS_FSDATA_MAGIC) return (0); /* Preserve crc */ fsdata_crc = fsdata->f_sum; /* Calculate */ fsdata->f_sum = (0); comp_crc = nandfs_crc32(0, (uint8_t *)fsdata, fsdata->f_bytes); /* Restore */ fsdata->f_sum = fsdata_crc; /* Check CRC */ return (fsdata_crc == comp_crc); } static int nandfs_check_superblock_crc(struct nandfs_fsdata *fsdata, struct nandfs_super_block *super) { uint32_t super_crc, comp_crc; /* Check super block magic */ if (super->s_magic != NANDFS_SUPER_MAGIC) return (0); /* Preserve CRC */ super_crc = super->s_sum; /* Calculate */ super->s_sum = (0); comp_crc = nandfs_crc32(0, (uint8_t *)super, fsdata->f_sbbytes); /* Restore */ super->s_sum = super_crc; /* Check CRC */ return (super_crc == comp_crc); } static int nandfs_find_super_block(struct nandfs *fs, struct open_file *f) { struct nandfs_super_block *sb; int i, j, n, s; int sectors_to_read, error; sb = malloc(fs->nf_sectorsize); if (sb == NULL) return (ENOMEM); memset(fs->nf_sb, 0, sizeof(*fs->nf_sb)); sectors_to_read = (NANDFS_NFSAREAS * fs->nf_fsdata->f_erasesize) / fs->nf_sectorsize; for (i = 0; i < sectors_to_read; i++) { NANDFS_DEBUG("reading i %d offset %d\n", i, i * fs->nf_sectorsize); error = ioread(f, i * fs->nf_sectorsize, (char *)sb, fs->nf_sectorsize); if (error) { NANDFS_DEBUG("error %d\n", error); continue; } n = fs->nf_sectorsize / sizeof(struct nandfs_super_block); s = 0; if ((i * fs->nf_sectorsize) % fs->nf_fsdata->f_erasesize == 0) { if (fs->nf_sectorsize == sizeof(struct nandfs_fsdata)) continue; else { s += (sizeof(struct nandfs_fsdata) / sizeof(struct nandfs_super_block)); } } for (j = s; j < n; j++) { if (!nandfs_check_superblock_crc(fs->nf_fsdata, &sb[j])) continue; NANDFS_DEBUG("magic %x wtime %jd, lastcp 0x%jx\n", sb[j].s_magic, sb[j].s_wtime, sb[j].s_last_cno); if (sb[j].s_last_cno > fs->nf_sb->s_last_cno) memcpy(fs->nf_sb, &sb[j], sizeof(*fs->nf_sb)); } } free(sb); return (fs->nf_sb->s_magic != 0 ? 
0 : EINVAL); } static int nandfs_find_fsdata(struct nandfs *fs, struct open_file *f) { int offset, error, i; NANDFS_DEBUG("starting\n"); offset = 0; for (i = 0; i < 64 * NANDFS_NFSAREAS; i++) { error = ioread(f, offset, (char *)fs->nf_fsdata, sizeof(struct nandfs_fsdata)); if (error) return (error); if (fs->nf_fsdata->f_magic == NANDFS_FSDATA_MAGIC) { NANDFS_DEBUG("found at %x, volume %s\n", offset, fs->nf_fsdata->f_volume_name); if (nandfs_check_fsdata_crc(fs->nf_fsdata)) break; } offset += fs->nf_sectorsize; } return (error); } static int nandfs_read_structures(struct nandfs *fs, struct open_file *f) { int error; error = nandfs_find_fsdata(fs, f); if (error) return (error); error = nandfs_find_super_block(fs, f); if (error == 0) NANDFS_DEBUG("selected sb with w_time %jd last_pseg %jx\n", fs->nf_sb->s_wtime, fs->nf_sb->s_last_pseg); return (error); } static int nandfs_mount(struct nandfs *fs, struct open_file *f) { int err = 0, level; uint64_t last_pseg; fs->nf_fsdata = malloc(sizeof(struct nandfs_fsdata)); fs->nf_sb = malloc(sizeof(struct nandfs_super_block)); err = nandfs_read_structures(fs, f); if (err) { free(fs->nf_fsdata); free(fs->nf_sb); return (err); } fs->nf_blocksize = 1 << (fs->nf_fsdata->f_log_block_size + 10); NANDFS_DEBUG("using superblock with wtime %jd\n", fs->nf_sb->s_wtime); fs->nf_cpno = fs->nf_sb->s_last_cno; last_pseg = fs->nf_sb->s_last_pseg; /* * Calculate indirect block levels. */ nandfs_daddr_t mult; mult = 1; - for (level = 0; level < NIADDR; level++) { + for (level = 0; level < NANDFS_NIADDR; level++) { mult *= NINDIR(fs); fs->nf_nindir[level] = mult; } nandfs_calc_mdt_consts(fs->nf_blocksize, &fs->nf_datfile_mdt, fs->nf_fsdata->f_dat_entry_size); nandfs_calc_mdt_consts(fs->nf_blocksize, &fs->nf_ifile_mdt, fs->nf_fsdata->f_inode_size); err = ioread(f, last_pseg * fs->nf_blocksize, &fs->nf_segsum, sizeof(struct nandfs_segment_summary)); if (err) { free(fs->nf_sb); free(fs->nf_fsdata); return (err); } err = ioread(f, (last_pseg + fs->nf_segsum.ss_nblocks - 1) * fs->nf_blocksize, &fs->nf_sroot, sizeof(struct nandfs_super_root)); if (err) { free(fs->nf_sb); free(fs->nf_fsdata); return (err); } fs->nf_datfile.inode = &fs->nf_sroot.sr_dat; LIST_INIT(&fs->nf_datfile.bmap_bufs); fs->nf_cpfile.inode = &fs->nf_sroot.sr_cpfile; LIST_INIT(&fs->nf_cpfile.bmap_bufs); err = nandfs_get_checkpoint(fs, fs->nf_cpno, &fs->nf_checkpoint); if (err) { free(fs->nf_sb); free(fs->nf_fsdata); return (err); } NANDFS_DEBUG("checkpoint cp_cno=%lld\n", fs->nf_checkpoint.cp_cno); NANDFS_DEBUG("checkpoint cp_inodes_count=%lld\n", fs->nf_checkpoint.cp_inodes_count); NANDFS_DEBUG("checkpoint cp_ifile_inode.i_blocks=%lld\n", fs->nf_checkpoint.cp_ifile_inode.i_blocks); fs->nf_ifile.inode = &fs->nf_checkpoint.cp_ifile_inode; LIST_INIT(&fs->nf_ifile.bmap_bufs); return (0); } #define NINDIR(fs) ((fs)->nf_blocksize / sizeof(nandfs_daddr_t)) static int nandfs_open(const char *path, struct open_file *f) { struct nandfs *fs; struct nandfs_node *node; int err, bsize, level; NANDFS_DEBUG("nandfs_open('%s', %p)\n", path, f); fs = malloc(sizeof(struct nandfs)); f->f_fsdata = fs; fs->nf_file = f; bsize = nandfs_probe_sectorsize(f); if (bsize < 0) { printf("Cannot probe medium sector size\n"); return (EINVAL); } fs->nf_sectorsize = bsize; /* * Calculate indirect block levels. 
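 *
 * nf_nindir[i] is the number of data blocks reachable through one
 * indirect block at level i.  For example (illustrative only), with a
 * 4096-byte block and 64-bit block pointers, NINDIR(fs) is
 * 4096 / sizeof(nandfs_daddr_t) = 512, giving the ladder
 * 512, 512**2 = 262144, 512**3 = 134217728.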
*/ nandfs_daddr_t mult; mult = 1; - for (level = 0; level < NIADDR; level++) { + for (level = 0; level < NANDFS_NIADDR; level++) { mult *= NINDIR(fs); fs->nf_nindir[level] = mult; } NANDFS_DEBUG("fs %p nf_sectorsize=%x\n", fs, fs->nf_sectorsize); err = nandfs_mount(fs, f); if (err) { NANDFS_DEBUG("Cannot mount nandfs: %s\n", strerror(err)); return (err); } node = nandfs_lookup_path(fs, path); if (node == NULL) return (EINVAL); fs->nf_offset = 0; fs->nf_buf = NULL; fs->nf_buf_blknr = -1; fs->nf_opened_node = node; LIST_INIT(&fs->nf_opened_node->bmap_bufs); return (0); } static void nandfs_free_node(struct nandfs_node *node) { struct bmap_buf *bmap, *tmp; free(node->inode); LIST_FOREACH_SAFE(bmap, &node->bmap_bufs, list, tmp) { LIST_REMOVE(bmap, list); free(bmap->map); free(bmap); } free(node); } static int nandfs_close(struct open_file *f) { struct nandfs *fs = f->f_fsdata; NANDFS_DEBUG("nandfs_close(%p)\n", f); if (fs->nf_buf != NULL) free(fs->nf_buf); nandfs_free_node(fs->nf_opened_node); free(fs->nf_sb); free(fs); return (0); } static int nandfs_read(struct open_file *f, void *addr, size_t size, size_t *resid) { struct nandfs *fs = (struct nandfs *)f->f_fsdata; size_t csize, buf_size; void *buf; int error = 0; NANDFS_DEBUG("nandfs_read(file=%p, addr=%p, size=%d)\n", f, addr, size); while (size != 0) { if (fs->nf_offset >= fs->nf_opened_node->inode->i_size) break; error = nandfs_buf_read(fs, &buf, &buf_size); if (error) break; csize = size; if (csize > buf_size) csize = buf_size; bcopy(buf, addr, csize); fs->nf_offset += csize; addr = (char *)addr + csize; size -= csize; } if (resid) *resid = size; return (error); } static off_t nandfs_seek(struct open_file *f, off_t offset, int where) { struct nandfs *fs = f->f_fsdata; off_t off; u_int size; NANDFS_DEBUG("nandfs_seek(file=%p, offset=%lld, where=%d)\n", f, offset, where); size = fs->nf_opened_node->inode->i_size; switch (where) { case SEEK_SET: off = 0; break; case SEEK_CUR: off = fs->nf_offset; break; case SEEK_END: off = size; break; default: errno = EINVAL; return (-1); } off += offset; if (off < 0 || off > size) { errno = EINVAL; return(-1); } fs->nf_offset = (u_int)off; return (off); } static int nandfs_stat(struct open_file *f, struct stat *sb) { struct nandfs *fs = f->f_fsdata; NANDFS_DEBUG("nandfs_stat(file=%p, stat=%p)\n", f, sb); sb->st_size = fs->nf_opened_node->inode->i_size; sb->st_mode = fs->nf_opened_node->inode->i_mode; sb->st_uid = fs->nf_opened_node->inode->i_uid; sb->st_gid = fs->nf_opened_node->inode->i_gid; return (0); } static int nandfs_readdir(struct open_file *f, struct dirent *d) { struct nandfs *fs = f->f_fsdata; struct nandfs_dir_entry *dirent; void *buf; size_t buf_size; NANDFS_DEBUG("nandfs_readdir(file=%p, dirent=%p)\n", f, d); if (fs->nf_offset >= fs->nf_opened_node->inode->i_size) { NANDFS_DEBUG("nandfs_readdir(file=%p, dirent=%p) ENOENT\n", f, d); return (ENOENT); } if (nandfs_buf_read(fs, &buf, &buf_size)) { NANDFS_DEBUG("nandfs_readdir(file=%p, dirent=%p)" "buf_read failed\n", f, d); return (EIO); } NANDFS_DEBUG("nandfs_readdir(file=%p, dirent=%p) moving forward\n", f, d); dirent = (struct nandfs_dir_entry *)buf; fs->nf_offset += dirent->rec_len; strncpy(d->d_name, dirent->name, dirent->name_len); d->d_name[dirent->name_len] = '\0'; d->d_type = dirent->file_type; return (0); } static int nandfs_buf_read(struct nandfs *fs, void **buf_p, size_t *size_p) { nandfs_daddr_t blknr, blkoff; blknr = fs->nf_offset / fs->nf_blocksize; blkoff = fs->nf_offset % fs->nf_blocksize; if (blknr != fs->nf_buf_blknr) { if 
(fs->nf_buf == NULL) fs->nf_buf = malloc(fs->nf_blocksize); if (nandfs_read_inode(fs, fs->nf_opened_node, blknr, 1, fs->nf_buf, 0)) return (EIO); fs->nf_buf_blknr = blknr; } *buf_p = fs->nf_buf + blkoff; *size_p = fs->nf_blocksize - blkoff; NANDFS_DEBUG("nandfs_buf_read buf_p=%p size_p=%d\n", *buf_p, *size_p); if (*size_p > fs->nf_opened_node->inode->i_size - fs->nf_offset) *size_p = fs->nf_opened_node->inode->i_size - fs->nf_offset; return (0); } static struct nandfs_node * nandfs_lookup_node(struct nandfs *fs, uint64_t ino) { uint64_t blocknr; int entrynr; struct nandfs_inode *buffer; struct nandfs_node *node; struct nandfs_inode *inode; NANDFS_DEBUG("nandfs_lookup_node ino=%lld\n", ino); if (ino == 0) { printf("nandfs_lookup_node: invalid inode requested\n"); return (NULL); } buffer = malloc(fs->nf_blocksize); inode = malloc(sizeof(struct nandfs_inode)); node = malloc(sizeof(struct nandfs_node)); nandfs_mdt_trans(&fs->nf_ifile_mdt, ino, &blocknr, &entrynr); if (nandfs_read_inode(fs, &fs->nf_ifile, blocknr, 1, buffer, 0)) return (NULL); memcpy(inode, &buffer[entrynr], sizeof(struct nandfs_inode)); node->inode = inode; free(buffer); return (node); } static struct nandfs_node * nandfs_lookup_path(struct nandfs *fs, const char *path) { struct nandfs_node *node; struct nandfs_dir_entry *dirent; char *namebuf; uint64_t i, done, pinode, inode; int nlinks = 0, counter, len, link_len, nameidx; uint8_t *buffer, *orig; char *strp, *lpath; buffer = malloc(fs->nf_blocksize); orig = buffer; namebuf = malloc(2 * MAXPATHLEN + 2); strncpy(namebuf, path, MAXPATHLEN); namebuf[MAXPATHLEN] = '\0'; done = nameidx = 0; lpath = namebuf; /* Get the root inode */ node = nandfs_lookup_node(fs, NANDFS_ROOT_INO); inode = NANDFS_ROOT_INO; while ((strp = strsep(&lpath, "/")) != NULL) { if (*strp == '\0') continue; if ((node->inode->i_mode & IFMT) != IFDIR) { nandfs_free_node(node); node = NULL; goto out; } len = strlen(strp); NANDFS_DEBUG("%s: looking for %s\n", __func__, strp); for (i = 0; i < node->inode->i_blocks; i++) { if (nandfs_read_inode(fs, node, i, 1, orig, 0)) { node = NULL; goto out; } buffer = orig; done = counter = 0; while (1) { dirent = (struct nandfs_dir_entry *)(void *)buffer; NANDFS_DEBUG("%s: dirent.name = %s\n", __func__, dirent->name); NANDFS_DEBUG("%s: dirent.rec_len = %d\n", __func__, dirent->rec_len); NANDFS_DEBUG("%s: dirent.inode = %lld\n", __func__, dirent->inode); if (len == dirent->name_len && (strncmp(strp, dirent->name, len) == 0) && dirent->inode != 0) { nandfs_free_node(node); node = nandfs_lookup_node(fs, dirent->inode); pinode = inode; inode = dirent->inode; done = 1; break; } counter += dirent->rec_len; buffer += dirent->rec_len; if (counter == fs->nf_blocksize) break; } if (done) break; } if (!done) { node = NULL; goto out; } NANDFS_DEBUG("%s: %.*s has mode %o\n", __func__, dirent->name_len, dirent->name, node->inode->i_mode); if ((node->inode->i_mode & IFMT) == IFLNK) { NANDFS_DEBUG("%s: %.*s is symlink\n", __func__, dirent->name_len, dirent->name); link_len = node->inode->i_size; if (++nlinks > MAXSYMLINKS) { nandfs_free_node(node); node = NULL; goto out; } if (nandfs_read_inode(fs, node, 0, 1, orig, 0)) { nandfs_free_node(node); node = NULL; goto out; } NANDFS_DEBUG("%s: symlink is %.*s\n", __func__, link_len, (char *)orig); nameidx = (nameidx == 0) ? 
MAXPATHLEN + 1 : 0; bcopy((char *)orig, namebuf + nameidx, (unsigned)link_len); if (lpath != NULL) { namebuf[nameidx + link_len++] = '/'; strncpy(namebuf + nameidx + link_len, lpath, MAXPATHLEN - link_len); namebuf[nameidx + MAXPATHLEN] = '\0'; } else namebuf[nameidx + link_len] = '\0'; NANDFS_DEBUG("%s: strp=%s, lpath=%s, namebuf0=%s, " "namebuf1=%s, idx=%d\n", __func__, strp, lpath, namebuf + 0, namebuf + MAXPATHLEN + 1, nameidx); lpath = namebuf + nameidx; nandfs_free_node(node); /* * If absolute pathname, restart at root. Otherwise * continue with out parent inode. */ inode = (orig[0] == '/') ? NANDFS_ROOT_INO : pinode; node = nandfs_lookup_node(fs, inode); } } out: free(namebuf); free(orig); return (node); } static int nandfs_read_inode(struct nandfs *fs, struct nandfs_node *node, nandfs_daddr_t blknr, u_int nblks, void *buf, int raw) { uint64_t *pblks; uint64_t *vblks; u_int i; int error; pblks = malloc(nblks * sizeof(uint64_t)); vblks = malloc(nblks * sizeof(uint64_t)); NANDFS_DEBUG("nandfs_read_inode fs=%p node=%p blknr=%lld nblks=%d\n", fs, node, blknr, nblks); for (i = 0; i < nblks; i++) { error = nandfs_bmap_lookup(fs, node, blknr + i, &vblks[i], raw); if (error) { free(pblks); free(vblks); return (error); } if (raw == 0) pblks[i] = nandfs_vtop(fs, vblks[i]); else pblks[i] = vblks[i]; } for (i = 0; i < nblks; i++) { if (ioread(fs->nf_file, pblks[i] * fs->nf_blocksize, buf, fs->nf_blocksize)) { free(pblks); free(vblks); return (EIO); } buf = (void *)((uintptr_t)buf + fs->nf_blocksize); } free(pblks); free(vblks); return (0); } static int nandfs_read_blk(struct nandfs *fs, nandfs_daddr_t blknr, void *buf, int phys) { uint64_t pblknr; pblknr = (phys ? blknr : nandfs_vtop(fs, blknr)); return (ioread(fs->nf_file, pblknr * fs->nf_blocksize, buf, fs->nf_blocksize)); } static int nandfs_get_checkpoint(struct nandfs *fs, uint64_t cpno, struct nandfs_checkpoint *cp) { uint64_t blocknr; int blockoff, cp_per_block, dlen; uint8_t *buf; NANDFS_DEBUG("nandfs_get_checkpoint(fs=%p cpno=%lld)\n", fs, cpno); buf = malloc(fs->nf_blocksize); cpno += NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET - 1; dlen = fs->nf_fsdata->f_checkpoint_size; cp_per_block = fs->nf_blocksize / dlen; blocknr = cpno / cp_per_block; blockoff = (cpno % cp_per_block) * dlen; if (nandfs_read_inode(fs, &fs->nf_cpfile, blocknr, 1, buf, 0)) { free(buf); return (EINVAL); } memcpy(cp, buf + blockoff, sizeof(struct nandfs_checkpoint)); free(buf); return (0); } static uint64_t * nandfs_get_map(struct nandfs *fs, struct nandfs_node *node, nandfs_daddr_t blknr, int phys) { struct bmap_buf *bmap; uint64_t *map; LIST_FOREACH(bmap, &node->bmap_bufs, list) { if (bmap->blknr == blknr) return (bmap->map); } map = malloc(fs->nf_blocksize); if (nandfs_read_blk(fs, blknr, map, phys)) { free(map); return (NULL); } bmap = malloc(sizeof(struct bmap_buf)); bmap->blknr = blknr; bmap->map = map; LIST_INSERT_HEAD(&node->bmap_bufs, bmap, list); NANDFS_DEBUG("%s:(node=%p, map=%p)\n", __func__, node, map); return (map); } static int nandfs_bmap_lookup(struct nandfs *fs, struct nandfs_node *node, nandfs_lbn_t lblknr, nandfs_daddr_t *vblknr, int phys) { struct nandfs_inode *ino; nandfs_daddr_t ind_block_num; uint64_t *map; int idx; int level; ino = node->inode; - if (lblknr < NDADDR) { + if (lblknr < NANDFS_NDADDR) { *vblknr = ino->i_db[lblknr]; return (0); } - lblknr -= NDADDR; + lblknr -= NANDFS_NDADDR; /* * nindir[0] = NINDIR * nindir[1] = NINDIR**2 * nindir[2] = NINDIR**3 * etc */ - for (level = 0; level < NIADDR; level++) { + for (level = 0; level < 
NANDFS_NIADDR; level++) { NANDFS_DEBUG("lblknr=%jx fs->nf_nindir[%d]=%d\n", lblknr, level, fs->nf_nindir[level]); if (lblknr < fs->nf_nindir[level]) break; lblknr -= fs->nf_nindir[level]; } - if (level == NIADDR) { + if (level == NANDFS_NIADDR) { /* Block number too high */ NANDFS_DEBUG("lblknr %jx too high\n", lblknr); return (EFBIG); } ind_block_num = ino->i_ib[level]; for (; level >= 0; level--) { if (ind_block_num == 0) { *vblknr = 0; /* missing */ return (0); } twiddle(1); NANDFS_DEBUG("calling get_map with %jx\n", ind_block_num); map = nandfs_get_map(fs, node, ind_block_num, phys); if (map == NULL) return (EIO); if (level > 0) { idx = lblknr / fs->nf_nindir[level - 1]; lblknr %= fs->nf_nindir[level - 1]; } else idx = lblknr; ind_block_num = ((nandfs_daddr_t *)map)[idx]; } *vblknr = ind_block_num; return (0); } static nandfs_daddr_t nandfs_vtop(struct nandfs *fs, nandfs_daddr_t vblocknr) { nandfs_lbn_t blocknr; nandfs_daddr_t pblocknr; int entrynr; struct nandfs_dat_entry *dat; dat = malloc(fs->nf_blocksize); nandfs_mdt_trans(&fs->nf_datfile_mdt, vblocknr, &blocknr, &entrynr); if (nandfs_read_inode(fs, &fs->nf_datfile, blocknr, 1, dat, 1)) { free(dat); return (0); } NANDFS_DEBUG("nandfs_vtop entrynr=%d vblocknr=%lld pblocknr=%lld\n", entrynr, vblocknr, dat[entrynr].de_blocknr); pblocknr = dat[entrynr].de_blocknr; free(dat); return (pblocknr); } static void nandfs_calc_mdt_consts(int blocksize, struct nandfs_mdt *mdt, int entry_size) { mdt->entries_per_group = blocksize * 8; /* bits in sector */ mdt->entries_per_block = blocksize / entry_size; mdt->blocks_per_group = (mdt->entries_per_group -1) / mdt->entries_per_block + 1 + 1; mdt->groups_per_desc_block = blocksize / sizeof(struct nandfs_block_group_desc); mdt->blocks_per_desc_block = mdt->groups_per_desc_block * mdt->blocks_per_group + 1; } static void nandfs_mdt_trans(struct nandfs_mdt *mdt, uint64_t index, nandfs_daddr_t *blocknr, uint32_t *entry_in_block) { nandfs_daddr_t blknr; uint64_t group, group_offset, blocknr_in_group; uint64_t desc_block, desc_offset; /* Calculate our offset in the file */ group = index / mdt->entries_per_group; group_offset = index % mdt->entries_per_group; desc_block = group / mdt->groups_per_desc_block; desc_offset = group % mdt->groups_per_desc_block; blocknr_in_group = group_offset / mdt->entries_per_block; /* To descgroup offset */ blknr = 1 + desc_block * mdt->blocks_per_desc_block; /* To group offset */ blknr += desc_offset * mdt->blocks_per_group; /* To actual file block */ blknr += 1 + blocknr_in_group; *blocknr = blknr; *entry_in_block = group_offset % mdt->entries_per_block; } static int ioread(struct open_file *f, off_t pos, void *buf, u_int length) { void *buffer; int err; int bsize = ((struct nandfs *)f->f_fsdata)->nf_sectorsize; u_int off, nsec; off = pos % bsize; pos /= bsize; nsec = howmany(length, bsize); NANDFS_DEBUG("pos=%lld length=%d off=%d nsec=%d\n", pos, length, off, nsec); buffer = malloc(nsec * bsize); err = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, pos, nsec * bsize, buffer, NULL); memcpy(buf, (void *)((uintptr_t)buffer + off), length); free(buffer); return (err); } static int nandfs_probe_sectorsize(struct open_file *f) { void *buffer; int i, err; buffer = malloc(16 * 1024); NANDFS_DEBUG("probing for sector size: "); for (i = 512; i < (16 * 1024); i <<= 1) { NANDFS_DEBUG("%d ", i); err = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, 0, i, buffer, NULL); if (err == 0) { NANDFS_DEBUG("found"); free(buffer); return (i); } } free(buffer); NANDFS_DEBUG("not found\n"); return 
(-1); } Index: head/lib/libstand/ufs.c =================================================================== --- head/lib/libstand/ufs.c (revision 313779) +++ head/lib/libstand/ufs.c (revision 313780) @@ -1,861 +1,861 @@ /* $NetBSD: ufs.c,v 1.20 1998/03/01 07:15:39 ross Exp $ */ /*- * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * * Copyright (c) 1990, 1991 Carnegie Mellon University * All Rights Reserved. * * Author: David Golub * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); /* * Stand-alone file reading package. 
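 *
 * All device I/O below goes through the device's dv_strategy routine;
 * the fs_ops vector that follows (open/close/read/write/seek/stat/
 * readdir) is the interface libstand consumers call into.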
*/ #include #include #include #include #include #include #include "stand.h" #include "string.h" static int ufs_open(const char *path, struct open_file *f); static int ufs_write(struct open_file *f, void *buf, size_t size, size_t *resid); static int ufs_close(struct open_file *f); static int ufs_read(struct open_file *f, void *buf, size_t size, size_t *resid); static off_t ufs_seek(struct open_file *f, off_t offset, int where); static int ufs_stat(struct open_file *f, struct stat *sb); static int ufs_readdir(struct open_file *f, struct dirent *d); struct fs_ops ufs_fsops = { "ufs", ufs_open, ufs_close, ufs_read, ufs_write, ufs_seek, ufs_stat, ufs_readdir }; /* * In-core open file. */ struct file { off_t f_seekp; /* seek pointer */ struct fs *f_fs; /* pointer to super-block */ union dinode { struct ufs1_dinode di1; struct ufs2_dinode di2; } f_di; /* copy of on-disk inode */ - int f_nindir[NIADDR]; + int f_nindir[UFS_NIADDR]; /* number of blocks mapped by indirect block at level i */ - char *f_blk[NIADDR]; /* buffer for indirect block at + char *f_blk[UFS_NIADDR]; /* buffer for indirect block at level i */ - size_t f_blksize[NIADDR]; + size_t f_blksize[UFS_NIADDR]; /* size of buffer */ - ufs2_daddr_t f_blkno[NIADDR];/* disk address of block in buffer */ + ufs2_daddr_t f_blkno[UFS_NIADDR];/* disk address of block in buffer */ ufs2_daddr_t f_buf_blkno; /* block number of data block */ char *f_buf; /* buffer for data block */ size_t f_buf_size; /* size of data block */ }; #define DIP(fp, field) \ ((fp)->f_fs->fs_magic == FS_UFS1_MAGIC ? \ (fp)->f_di.di1.field : (fp)->f_di.di2.field) static int read_inode(ino_t, struct open_file *); static int block_map(struct open_file *, ufs2_daddr_t, ufs2_daddr_t *); static int buf_read_file(struct open_file *, char **, size_t *); static int buf_write_file(struct open_file *, char *, size_t *); static int search_directory(char *, struct open_file *, ino_t *); /* * Read a new inode into a file structure. */ static int read_inode(inumber, f) ino_t inumber; struct open_file *f; { struct file *fp = (struct file *)f->f_fsdata; struct fs *fs = fp->f_fs; char *buf; size_t rsize; int rc; if (fs == NULL) panic("fs == NULL"); /* * Read inode and save it. */ buf = malloc(fs->fs_bsize); twiddle(1); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fs, ino_to_fsba(fs, inumber)), fs->fs_bsize, buf, &rsize); if (rc) goto out; if (rsize != fs->fs_bsize) { rc = EIO; goto out; } if (fp->f_fs->fs_magic == FS_UFS1_MAGIC) fp->f_di.di1 = ((struct ufs1_dinode *)buf) [ino_to_fsbo(fs, inumber)]; else fp->f_di.di2 = ((struct ufs2_dinode *)buf) [ino_to_fsbo(fs, inumber)]; /* * Clear out the old buffers */ { int level; - for (level = 0; level < NIADDR; level++) + for (level = 0; level < UFS_NIADDR; level++) fp->f_blkno[level] = -1; fp->f_buf_blkno = -1; } fp->f_seekp = 0; out: free(buf); return (rc); } /* * Given an offset in a file, find the disk block number that * contains that block. */ static int block_map(f, file_block, disk_block_p) struct open_file *f; ufs2_daddr_t file_block; ufs2_daddr_t *disk_block_p; /* out */ { struct file *fp = (struct file *)f->f_fsdata; struct fs *fs = fp->f_fs; int level; int idx; ufs2_daddr_t ind_block_num; int rc; /* * Index structure of an inode: * - * di_db[0..NDADDR-1] hold block numbers for blocks - * 0..NDADDR-1 + * di_db[0..UFS_NDADDR-1] hold block numbers for blocks + * 0..UFS_NDADDR-1 * * di_ib[0] index block 0 is the single indirect block * holds block numbers for blocks - * NDADDR .. NDADDR + NINDIR(fs)-1 + * UFS_NDADDR .. 
UFS_NDADDR + NINDIR(fs)-1 * * di_ib[1] index block 1 is the double indirect block * holds block numbers for INDEX blocks for blocks - * NDADDR + NINDIR(fs) .. - * NDADDR + NINDIR(fs) + NINDIR(fs)**2 - 1 + * UFS_NDADDR + NINDIR(fs) .. + * UFS_NDADDR + NINDIR(fs) + NINDIR(fs)**2 - 1 * * di_ib[2] index block 2 is the triple indirect block * holds block numbers for double-indirect * blocks for blocks - * NDADDR + NINDIR(fs) + NINDIR(fs)**2 .. - * NDADDR + NINDIR(fs) + NINDIR(fs)**2 + * UFS_NDADDR + NINDIR(fs) + NINDIR(fs)**2 .. + * UFS_NDADDR + NINDIR(fs) + NINDIR(fs)**2 * + NINDIR(fs)**3 - 1 */ - if (file_block < NDADDR) { + if (file_block < UFS_NDADDR) { /* Direct block. */ *disk_block_p = DIP(fp, di_db[file_block]); return (0); } - file_block -= NDADDR; + file_block -= UFS_NDADDR; /* * nindir[0] = NINDIR * nindir[1] = NINDIR**2 * nindir[2] = NINDIR**3 * etc */ - for (level = 0; level < NIADDR; level++) { + for (level = 0; level < UFS_NIADDR; level++) { if (file_block < fp->f_nindir[level]) break; file_block -= fp->f_nindir[level]; } - if (level == NIADDR) { + if (level == UFS_NIADDR) { /* Block number too high */ return (EFBIG); } ind_block_num = DIP(fp, di_ib[level]); for (; level >= 0; level--) { if (ind_block_num == 0) { *disk_block_p = 0; /* missing */ return (0); } if (fp->f_blkno[level] != ind_block_num) { if (fp->f_blk[level] == (char *)0) fp->f_blk[level] = malloc(fs->fs_bsize); twiddle(1); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fp->f_fs, ind_block_num), fs->fs_bsize, fp->f_blk[level], &fp->f_blksize[level]); if (rc) return (rc); if (fp->f_blksize[level] != fs->fs_bsize) return (EIO); fp->f_blkno[level] = ind_block_num; } if (level > 0) { idx = file_block / fp->f_nindir[level - 1]; file_block %= fp->f_nindir[level - 1]; } else idx = file_block; if (fp->f_fs->fs_magic == FS_UFS1_MAGIC) ind_block_num = ((ufs1_daddr_t *)fp->f_blk[level])[idx]; else ind_block_num = ((ufs2_daddr_t *)fp->f_blk[level])[idx]; } *disk_block_p = ind_block_num; return (0); } /* * Write a portion of a file from an internal buffer. */ static int buf_write_file(f, buf_p, size_p) struct open_file *f; char *buf_p; size_t *size_p; /* out */ { struct file *fp = (struct file *)f->f_fsdata; struct fs *fs = fp->f_fs; long off; ufs_lbn_t file_block; ufs2_daddr_t disk_block; size_t block_size; int rc; /* * Calculate the starting block address and offset. */ off = blkoff(fs, fp->f_seekp); file_block = lblkno(fs, fp->f_seekp); block_size = sblksize(fs, DIP(fp, di_size), file_block); rc = block_map(f, file_block, &disk_block); if (rc) return (rc); if (disk_block == 0) /* Because we can't allocate space on the drive */ return (EFBIG); /* * Truncate buffer at end of file, and at the end of * this block. */ if (*size_p > DIP(fp, di_size) - fp->f_seekp) *size_p = DIP(fp, di_size) - fp->f_seekp; if (*size_p > block_size - off) *size_p = block_size - off; /* * If we don't entirely occlude the block and it's not * in memory already, read it in first. */ if (((off > 0) || (*size_p + off < block_size)) && (file_block != fp->f_buf_blkno)) { if (fp->f_buf == (char *)0) fp->f_buf = malloc(fs->fs_bsize); twiddle(4); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fs, disk_block), block_size, fp->f_buf, &fp->f_buf_size); if (rc) return (rc); fp->f_buf_blkno = file_block; } /* * Copy the user data into the cached block. */ bcopy(buf_p, fp->f_buf + off, *size_p); /* * Write the block out to storage. 
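 *
 * (A partial write, i.e. off > 0 or *size_p + off < block_size, must
 * not clobber the bytes of the block it does not cover, which is why
 * the block was read in above before being modified and rewritten.)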
*/ twiddle(4); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_WRITE, fsbtodb(fs, disk_block), block_size, fp->f_buf, &fp->f_buf_size); return (rc); } /* * Read a portion of a file into an internal buffer. Return * the location in the buffer and the amount in the buffer. */ static int buf_read_file(f, buf_p, size_p) struct open_file *f; char **buf_p; /* out */ size_t *size_p; /* out */ { struct file *fp = (struct file *)f->f_fsdata; struct fs *fs = fp->f_fs; long off; ufs_lbn_t file_block; ufs2_daddr_t disk_block; size_t block_size; int rc; off = blkoff(fs, fp->f_seekp); file_block = lblkno(fs, fp->f_seekp); block_size = sblksize(fs, DIP(fp, di_size), file_block); if (file_block != fp->f_buf_blkno) { if (fp->f_buf == (char *)0) fp->f_buf = malloc(fs->fs_bsize); rc = block_map(f, file_block, &disk_block); if (rc) return (rc); if (disk_block == 0) { bzero(fp->f_buf, block_size); fp->f_buf_size = block_size; } else { twiddle(4); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fs, disk_block), block_size, fp->f_buf, &fp->f_buf_size); if (rc) return (rc); } fp->f_buf_blkno = file_block; } /* * Return address of byte in buffer corresponding to * offset, and size of remainder of buffer after that * byte. */ *buf_p = fp->f_buf + off; *size_p = block_size - off; /* * But truncate buffer at end of file. */ if (*size_p > DIP(fp, di_size) - fp->f_seekp) *size_p = DIP(fp, di_size) - fp->f_seekp; return (0); } /* * Search a directory for a name and return its * i_number. */ static int search_directory(name, f, inumber_p) char *name; struct open_file *f; ino_t *inumber_p; /* out */ { struct file *fp = (struct file *)f->f_fsdata; struct direct *dp; struct direct *edp; char *buf; size_t buf_size; int namlen, length; int rc; length = strlen(name); fp->f_seekp = 0; while (fp->f_seekp < DIP(fp, di_size)) { rc = buf_read_file(f, &buf, &buf_size); if (rc) return (rc); dp = (struct direct *)buf; edp = (struct direct *)(buf + buf_size); while (dp < edp) { if (dp->d_ino == (ino_t)0) goto next; #if BYTE_ORDER == LITTLE_ENDIAN if (fp->f_fs->fs_maxsymlinklen <= 0) namlen = dp->d_type; else #endif namlen = dp->d_namlen; if (namlen == length && !strcmp(name, dp->d_name)) { /* found entry */ *inumber_p = dp->d_ino; return (0); } next: dp = (struct direct *)((char *)dp + dp->d_reclen); } fp->f_seekp += buf_size; } return (ENOENT); } static int sblock_try[] = SBLOCKSEARCH; /* * Open a file. */ static int ufs_open(upath, f) const char *upath; struct open_file *f; { char *cp, *ncp; int c; ino_t inumber, parent_inumber; struct file *fp; struct fs *fs; int i, rc; size_t buf_size; int nlinks = 0; char namebuf[MAXPATHLEN+1]; char *buf = NULL; char *path = NULL; /* allocate file system specific data structure */ fp = malloc(sizeof(struct file)); bzero(fp, sizeof(struct file)); f->f_fsdata = (void *)fp; /* allocate space and read super block */ fs = malloc(SBLOCKSIZE); fp->f_fs = fs; twiddle(1); /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, (char *)fs, &buf_size); if (rc) goto out; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc == sblock_try[i])) && buf_size == SBLOCKSIZE && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; } if (sblock_try[i] == -1) { rc = EINVAL; goto out; } /* * Calculate indirect block levels. 
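buf_read_file() above returns a pointer into the cached block plus the number of bytes usable from it, clipped at end of file. A minimal stand-alone sketch of that window arithmetic, assuming a made-up 8K block and a 10000-byte file; only the shift/mask idiom is taken from the code:

#include <stdio.h>

#define BSHIFT	13		/* log2 of an assumed 8K block size */
#define BSIZE	(1L << BSHIFT)
#define BMASK	(BSIZE - 1)

int
main(void)
{
	long di_size = 10000;	/* hypothetical file length */
	long seekp = 9000;	/* current seek pointer */
	long off, size;

	off = seekp & BMASK;		/* blkoff(): offset within block */
	size = BSIZE - off;		/* rest of the cached block... */
	if (size > di_size - seekp)	/* ...truncated at end of file */
		size = di_size - seekp;
	printf("logical block %ld, offset %ld, %ld readable bytes\n",
	    seekp >> BSHIFT, off, size);
	return (0);
}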
*/ { ufs2_daddr_t mult; int level; mult = 1; - for (level = 0; level < NIADDR; level++) { + for (level = 0; level < UFS_NIADDR; level++) { mult *= NINDIR(fs); fp->f_nindir[level] = mult; } } - inumber = ROOTINO; + inumber = UFS_ROOTINO; if ((rc = read_inode(inumber, f)) != 0) goto out; cp = path = strdup(upath); if (path == NULL) { rc = ENOMEM; goto out; } while (*cp) { /* * Remove extra separators */ while (*cp == '/') cp++; if (*cp == '\0') break; /* * Check that current node is a directory. */ if ((DIP(fp, di_mode) & IFMT) != IFDIR) { rc = ENOTDIR; goto out; } /* * Get next component of path name. */ { int len = 0; ncp = cp; while ((c = *cp) != '\0' && c != '/') { if (++len > UFS_MAXNAMLEN) { rc = ENOENT; goto out; } cp++; } *cp = '\0'; } /* * Look up component in current directory. * Save directory inumber in case we find a * symbolic link. */ parent_inumber = inumber; rc = search_directory(ncp, f, &inumber); *cp = c; if (rc) goto out; /* * Open next component. */ if ((rc = read_inode(inumber, f)) != 0) goto out; /* * Check for symbolic link. */ if ((DIP(fp, di_mode) & IFMT) == IFLNK) { int link_len = DIP(fp, di_size); int len; len = strlen(cp); if (link_len + len > MAXPATHLEN || ++nlinks > MAXSYMLINKS) { rc = ENOENT; goto out; } bcopy(cp, &namebuf[link_len], len + 1); if (link_len < fs->fs_maxsymlinklen) { if (fp->f_fs->fs_magic == FS_UFS1_MAGIC) cp = (caddr_t)(fp->f_di.di1.di_db); else cp = (caddr_t)(fp->f_di.di2.di_db); bcopy(cp, namebuf, (unsigned) link_len); } else { /* * Read file for symbolic link */ size_t buf_size; ufs2_daddr_t disk_block; struct fs *fs = fp->f_fs; if (!buf) buf = malloc(fs->fs_bsize); rc = block_map(f, (ufs2_daddr_t)0, &disk_block); if (rc) goto out; twiddle(1); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fs, disk_block), fs->fs_bsize, buf, &buf_size); if (rc) goto out; bcopy((char *)buf, namebuf, (unsigned)link_len); } /* * If relative pathname, restart at parent directory. * If absolute pathname, restart at root. */ cp = namebuf; if (*cp != '/') inumber = parent_inumber; else - inumber = (ino_t)ROOTINO; + inumber = (ino_t)UFS_ROOTINO; if ((rc = read_inode(inumber, f)) != 0) goto out; } } /* * Found terminal component. */ rc = 0; fp->f_seekp = 0; out: if (buf) free(buf); if (path) free(path); if (rc) { if (fp->f_buf) free(fp->f_buf); free(fp->f_fs); free(fp); } return (rc); } static int ufs_close(f) struct open_file *f; { struct file *fp = (struct file *)f->f_fsdata; int level; f->f_fsdata = (void *)0; if (fp == (struct file *)0) return (0); - for (level = 0; level < NIADDR; level++) { + for (level = 0; level < UFS_NIADDR; level++) { if (fp->f_blk[level]) free(fp->f_blk[level]); } if (fp->f_buf) free(fp->f_buf); free(fp->f_fs); free(fp); return (0); } /* * Copy a portion of a file into kernel memory. * Cross block boundaries when necessary. */ static int ufs_read(f, start, size, resid) struct open_file *f; void *start; size_t size; size_t *resid; /* out */ { struct file *fp = (struct file *)f->f_fsdata; size_t csize; char *buf; size_t buf_size; int rc = 0; char *addr = start; while (size != 0) { if (fp->f_seekp >= DIP(fp, di_size)) break; rc = buf_read_file(f, &buf, &buf_size); if (rc) break; csize = size; if (csize > buf_size) csize = buf_size; bcopy(buf, addr, csize); fp->f_seekp += csize; addr += csize; size -= csize; } if (resid) *resid = size; return (rc); } /* * Write to a portion of an already allocated file. * Cross block boundaries when necessary. Can not * extend the file. 
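The symbolic-link case in ufs_open() above splices the link target in front of the not-yet-consumed tail of the path and restarts the lookup. The splice in isolation, with hypothetical names and a fixed buffer standing in for namebuf/MAXPATHLEN:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char namebuf[1024];
	const char *target = "../lib";		/* symlink contents */
	const char *rest = "/libc.so.7";	/* path left to resolve */
	int link_len = (int)strlen(target);

	/* same order as above: move the remainder, then copy the target */
	memmove(&namebuf[link_len], rest, strlen(rest) + 1);
	memcpy(namebuf, target, link_len);
	printf("restart lookup at: %s\n", namebuf);
	return (0);
}

Because the result does not start with '/', the walk resumes at the parent directory's inode, exactly as the inumber = parent_inumber branch does.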
*/ static int ufs_write(f, start, size, resid) struct open_file *f; void *start; size_t size; size_t *resid; /* out */ { struct file *fp = (struct file *)f->f_fsdata; size_t csize; int rc = 0; char *addr = start; csize = size; while ((size != 0) && (csize != 0)) { if (fp->f_seekp >= DIP(fp, di_size)) break; if (csize >= 512) csize = 512; /* XXX */ rc = buf_write_file(f, addr, &csize); if (rc) break; fp->f_seekp += csize; addr += csize; size -= csize; } if (resid) *resid = size; return (rc); } static off_t ufs_seek(f, offset, where) struct open_file *f; off_t offset; int where; { struct file *fp = (struct file *)f->f_fsdata; switch (where) { case SEEK_SET: fp->f_seekp = offset; break; case SEEK_CUR: fp->f_seekp += offset; break; case SEEK_END: fp->f_seekp = DIP(fp, di_size) - offset; break; default: errno = EINVAL; return (-1); } return (fp->f_seekp); } static int ufs_stat(f, sb) struct open_file *f; struct stat *sb; { struct file *fp = (struct file *)f->f_fsdata; /* only important stuff */ sb->st_mode = DIP(fp, di_mode); sb->st_uid = DIP(fp, di_uid); sb->st_gid = DIP(fp, di_gid); sb->st_size = DIP(fp, di_size); return (0); } static int ufs_readdir(struct open_file *f, struct dirent *d) { struct file *fp = (struct file *)f->f_fsdata; struct direct *dp; char *buf; size_t buf_size; int error; /* * assume that a directory entry will not be split across blocks */ again: if (fp->f_seekp >= DIP(fp, di_size)) return (ENOENT); error = buf_read_file(f, &buf, &buf_size); if (error) return (error); dp = (struct direct *)buf; fp->f_seekp += dp->d_reclen; if (dp->d_ino == (ino_t)0) goto again; d->d_type = dp->d_type; strcpy(d->d_name, dp->d_name); return (0); } Index: head/sbin/dump/traverse.c =================================================================== --- head/sbin/dump/traverse.c (revision 313779) +++ head/sbin/dump/traverse.c (revision 313780) @@ -1,1009 +1,1008 @@ /*- * Copyright (c) 1980, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifndef lint #if 0 static char sccsid[] = "@(#)traverse.c 8.7 (Berkeley) 6/15/95"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "dump.h" union dinode { struct ufs1_dinode dp1; struct ufs2_dinode dp2; }; #define DIP(dp, field) \ ((sblock->fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) #define DIP_SET(dp, field, val) do {\ if (sblock->fs_magic == FS_UFS1_MAGIC) \ (dp)->dp1.field = (val); \ else \ (dp)->dp2.field = (val); \ } while (0) #define HASDUMPEDFILE 0x1 #define HASSUBDIRS 0x2 static int dirindir(ino_t ino, ufs2_daddr_t blkno, int level, long *size, long *tapesize, int nodump, ino_t maxino); static void dmpindir(union dinode *dp, ino_t ino, ufs2_daddr_t blk, int level, off_t *size); static void ufs1_blksout(ufs1_daddr_t *blkp, int frags, ino_t ino); static void ufs2_blksout(union dinode *dp, ufs2_daddr_t *blkp, int frags, ino_t ino, int last); static int appendextdata(union dinode *dp); static void writeextdata(union dinode *dp, ino_t ino, int added); static int searchdir(ino_t ino, ufs2_daddr_t blkno, long size, long filesize, long *tapesize, int nodump, ino_t maxino); static long blockest(union dinode *dp); /* * This is an estimation of the number of TP_BSIZE blocks in the file. * It estimates the number of blocks in files with holes by assuming * that all of the blocks accounted for by di_blocks are data blocks * (when some of the blocks are usually used for indirect pointers); * hence the estimate may be high. */ static long blockest(union dinode *dp) { long blkest, sizeest; /* * dp->di_size is the size of the file in bytes. * dp->di_blocks stores the number of sectors actually in the file. * If there are more sectors than the size would indicate, this just * means that there are indirect blocks in the file or unused * sectors in the last file block; we can safely ignore these * (blkest = sizeest below). * If the file is bigger than the number of sectors would indicate, * then the file has holes in it. In this case we must use the * block count to estimate the number of data blocks used, but * we use the actual size for estimating the number of indirect * dump blocks (sizeest vs. blkest in the indirect block * calculation). */ if ((DIP(dp, di_flags) & SF_SNAPSHOT) != 0) return (1); blkest = howmany(dbtob(DIP(dp, di_blocks)), TP_BSIZE); sizeest = howmany(DIP(dp, di_size), TP_BSIZE); if (blkest > sizeest) blkest = sizeest; - if (DIP(dp, di_size) > sblock->fs_bsize * NDADDR) { + if (DIP(dp, di_size) > sblock->fs_bsize * UFS_NDADDR) { /* calculate the number of indirect blocks on the dump tape */ - blkest += - howmany(sizeest - NDADDR * sblock->fs_bsize / TP_BSIZE, - TP_NINDIR); + blkest += howmany(sizeest - + UFS_NDADDR * sblock->fs_bsize / TP_BSIZE, TP_NINDIR); } return (blkest + 1); } /* Auxiliary macro to pick up files changed since previous dump. */ #define CHANGEDSINCE(dp, t) \ (DIP(dp, di_mtime) >= (t) || DIP(dp, di_ctime) >= (t)) /* The WANTTODUMP macro decides whether a file should be dumped. */ #ifdef UF_NODUMP #define WANTTODUMP(dp) \ (CHANGEDSINCE(dp, spcl.c_ddate) && \ (nonodump || (DIP(dp, di_flags) & UF_NODUMP) != UF_NODUMP)) #else #define WANTTODUMP(dp) CHANGEDSINCE(dp, spcl.c_ddate) #endif /* * Dump pass 1. * * Walk the inode list for a file system to find all allocated inodes * that have been modified since the previous dump time. 
Also, find all * the directories in the file system. */ int mapfiles(ino_t maxino, long *tapesize) { int i, cg, mode, inosused; int anydirskipped = 0; union dinode *dp; struct cg *cgp; ino_t ino; u_char *cp; if ((cgp = malloc(sblock->fs_cgsize)) == NULL) quit("mapfiles: cannot allocate memory.\n"); for (cg = 0; cg < sblock->fs_ncg; cg++) { ino = cg * sblock->fs_ipg; bread(fsbtodb(sblock, cgtod(sblock, cg)), (char *)cgp, sblock->fs_cgsize); if (sblock->fs_magic == FS_UFS2_MAGIC) inosused = cgp->cg_initediblk; else inosused = sblock->fs_ipg; /* * If we are using soft updates, then we can trust the * cylinder group inode allocation maps to tell us which * inodes are allocated. We will scan the used inode map * to find the inodes that are really in use, and then * read only those inodes in from disk. */ if (sblock->fs_flags & FS_DOSOFTDEP) { if (!cg_chkmagic(cgp)) quit("mapfiles: cg %d: bad magic number\n", cg); cp = &cg_inosused(cgp)[(inosused - 1) / CHAR_BIT]; for ( ; inosused > 0; inosused -= CHAR_BIT, cp--) { if (*cp == 0) continue; for (i = 1 << (CHAR_BIT - 1); i > 0; i >>= 1) { if (*cp & i) break; inosused--; } break; } if (inosused <= 0) continue; } for (i = 0; i < inosused; i++, ino++) { - if (ino < ROOTINO || + if (ino < UFS_ROOTINO || (dp = getino(ino, &mode)) == NULL || (mode & IFMT) == 0) continue; if (ino >= maxino) { msg("Skipping inode %ju >= maxino %ju\n", (uintmax_t)ino, (uintmax_t)maxino); continue; } /* * Everything must go in usedinomap so that a check * for "in dumpdirmap but not in usedinomap" to detect * dirs with nodump set has a chance of succeeding * (this is used in mapdirs()). */ SETINO(ino, usedinomap); if (mode == IFDIR) SETINO(ino, dumpdirmap); if (WANTTODUMP(dp)) { SETINO(ino, dumpinomap); if (mode != IFREG && mode != IFDIR && mode != IFLNK) *tapesize += 1; else *tapesize += blockest(dp); continue; } if (mode == IFDIR) { if (!nonodump && (DIP(dp, di_flags) & UF_NODUMP)) CLRINO(ino, usedinomap); anydirskipped = 1; } } } /* * Restore gets very upset if the root is not dumped, * so ensure that it always is dumped. */ - SETINO(ROOTINO, dumpinomap); + SETINO(UFS_ROOTINO, dumpinomap); return (anydirskipped); } /* * Dump pass 2. * * Scan each directory on the file system to see if it has any modified * files in it. If it does, and has not already been added to the dump * list (because it was itself modified), then add it. If a directory * has not been modified itself, contains no modified files and has no * subdirectories, then it can be deleted from the dump list and from * the list of directories. By deleting it from the list of directories, * its parent may now qualify for the same treatment on this or a later * pass using this algorithm. */ int mapdirs(ino_t maxino, long *tapesize) { union dinode *dp; int i, isdir, nodump; char *map; ino_t ino; union dinode di; long filesize; int ret, change = 0; isdir = 0; /* XXX just to get gcc to shut up */ for (map = dumpdirmap, ino = 1; ino < maxino; ino++) { if (((ino - 1) % CHAR_BIT) == 0) /* map is offset by 1 */ isdir = *map++; else isdir >>= 1; /* * If a directory has been removed from usedinomap, it * either has the nodump flag set, or has inherited * it. Although a directory can't be in dumpinomap if * it isn't in usedinomap, we have to go through it to * propagate the nodump flag. */ nodump = !nonodump && (TSTINO(ino, usedinomap) == 0); if ((isdir & 1) == 0 || (TSTINO(ino, dumpinomap) && !nodump)) continue; dp = getino(ino, &i); /* * inode buf may change in searchdir(). 
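The soft-updates shortcut in mapfiles() above trusts the cylinder group's cg_inosused bitmap and scans it backwards, byte by byte and then bit by bit, to find the highest allocated inode so the per-inode loop can stop early. The same backward scan, stand-alone, over a fabricated 32-inode map (values illustrative only):

#include <stdio.h>
#include <limits.h>

int
main(void)
{
	/* hypothetical cg_inosused: inodes 0-9 of this group allocated */
	unsigned char map[4] = { 0xff, 0x03, 0x00, 0x00 };
	int inosused = 32;
	unsigned char *cp;
	int i;

	/* start at the byte holding the last possible bit, walk down */
	cp = &map[(inosused - 1) / CHAR_BIT];
	for (; inosused > 0; inosused -= CHAR_BIT, cp--) {
		if (*cp == 0)
			continue;
		for (i = 1 << (CHAR_BIT - 1); i > 0; i >>= 1) {
			if (*cp & i)
				break;
			inosused--;
		}
		break;
	}
	printf("scan %d inodes in this group\n", inosused);	/* 10 */
	return (0);
}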
*/ if (sblock->fs_magic == FS_UFS1_MAGIC) di.dp1 = dp->dp1; else di.dp2 = dp->dp2; filesize = DIP(&di, di_size); - for (ret = 0, i = 0; filesize > 0 && i < NDADDR; i++) { + for (ret = 0, i = 0; filesize > 0 && i < UFS_NDADDR; i++) { if (DIP(&di, di_db[i]) != 0) ret |= searchdir(ino, DIP(&di, di_db[i]), (long)sblksize(sblock, DIP(&di, di_size), i), filesize, tapesize, nodump, maxino); if (ret & HASDUMPEDFILE) filesize = 0; else filesize -= sblock->fs_bsize; } - for (i = 0; filesize > 0 && i < NIADDR; i++) { + for (i = 0; filesize > 0 && i < UFS_NIADDR; i++) { if (DIP(&di, di_ib[i]) == 0) continue; ret |= dirindir(ino, DIP(&di, di_ib[i]), i, &filesize, tapesize, nodump, maxino); } if (ret & HASDUMPEDFILE) { SETINO(ino, dumpinomap); *tapesize += blockest(&di); change = 1; continue; } if (nodump) { if (ret & HASSUBDIRS) change = 1; /* subdirs inherit nodump */ CLRINO(ino, dumpdirmap); } else if ((ret & HASSUBDIRS) == 0) if (!TSTINO(ino, dumpinomap)) { CLRINO(ino, dumpdirmap); change = 1; } } return (change); } /* * Read indirect blocks, and pass the data blocks to be searched * as directories. Quit as soon as any entry is found that will * require the directory to be dumped. */ static int dirindir( ino_t ino, ufs2_daddr_t blkno, int ind_level, long *filesize, long *tapesize, int nodump, ino_t maxino) { union { ufs1_daddr_t ufs1[MAXBSIZE / sizeof(ufs1_daddr_t)]; ufs2_daddr_t ufs2[MAXBSIZE / sizeof(ufs2_daddr_t)]; } idblk; int ret = 0; int i; bread(fsbtodb(sblock, blkno), (char *)&idblk, (int)sblock->fs_bsize); if (ind_level <= 0) { for (i = 0; *filesize > 0 && i < NINDIR(sblock); i++) { if (sblock->fs_magic == FS_UFS1_MAGIC) blkno = idblk.ufs1[i]; else blkno = idblk.ufs2[i]; if (blkno != 0) ret |= searchdir(ino, blkno, sblock->fs_bsize, *filesize, tapesize, nodump, maxino); if (ret & HASDUMPEDFILE) *filesize = 0; else *filesize -= sblock->fs_bsize; } return (ret); } ind_level--; for (i = 0; *filesize > 0 && i < NINDIR(sblock); i++) { if (sblock->fs_magic == FS_UFS1_MAGIC) blkno = idblk.ufs1[i]; else blkno = idblk.ufs2[i]; if (blkno != 0) ret |= dirindir(ino, blkno, ind_level, filesize, tapesize, nodump, maxino); } return (ret); } /* * Scan a disk block containing directory information looking to see if * any of the entries are on the dump list and to see if the directory * contains any subdirectories. */ static int searchdir( ino_t ino, ufs2_daddr_t blkno, long size, long filesize, long *tapesize, int nodump, ino_t maxino) { int mode; struct direct *dp; union dinode *ip; long loc, ret = 0; static caddr_t dblk; if (dblk == NULL && (dblk = malloc(sblock->fs_bsize)) == NULL) quit("searchdir: cannot allocate indirect memory.\n"); bread(fsbtodb(sblock, blkno), dblk, (int)size); if (filesize < size) size = filesize; for (loc = 0; loc < size; ) { dp = (struct direct *)(dblk + loc); if (dp->d_reclen == 0) { msg("corrupted directory, inumber %ju\n", (uintmax_t)ino); break; } loc += dp->d_reclen; if (dp->d_ino == 0) continue; if (dp->d_ino >= maxino) { msg("corrupted directory entry, d_ino %ju >= %ju\n", (uintmax_t)dp->d_ino, (uintmax_t)maxino); break; } if (dp->d_name[0] == '.') { if (dp->d_name[1] == '\0') continue; if (dp->d_name[1] == '.' && dp->d_name[2] == '\0') continue; } if (nodump) { ip = getino(dp->d_ino, &mode); if (TSTINO(dp->d_ino, dumpinomap)) { CLRINO(dp->d_ino, dumpinomap); *tapesize -= blockest(ip); } /* * Add back to dumpdirmap and remove from usedinomap * to propagate nodump. 
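mapdirs() above streams the dumpdirmap bitmap one bit per inode, reloading a byte every CHAR_BIT inodes and shifting in between; the map is offset by one because inode numbers start at 1. The same idiom in isolation over a hypothetical two-byte map:

#include <stdio.h>
#include <limits.h>

int
main(void)
{
	/* hypothetical dumpdirmap: inodes 2, 3 and 11 are directories */
	char dumpdirmap[2] = { 0x06, 0x04 };
	char *map;
	int isdir = 0;
	unsigned ino, maxino = 17;

	for (map = dumpdirmap, ino = 1; ino < maxino; ino++) {
		if (((ino - 1) % CHAR_BIT) == 0)   /* map is offset by 1 */
			isdir = *map++;
		else
			isdir >>= 1;
		if (isdir & 1)
			printf("inode %u is a directory\n", ino);
	}
	return (0);
}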
*/ if (mode == IFDIR) { SETINO(dp->d_ino, dumpdirmap); CLRINO(dp->d_ino, usedinomap); ret |= HASSUBDIRS; } } else { if (TSTINO(dp->d_ino, dumpinomap)) { ret |= HASDUMPEDFILE; if (ret & HASSUBDIRS) break; } if (TSTINO(dp->d_ino, dumpdirmap)) { ret |= HASSUBDIRS; if (ret & HASDUMPEDFILE) break; } } } return (ret); } /* * Dump passes 3 and 4. * * Dump the contents of an inode to tape. */ void dumpino(union dinode *dp, ino_t ino) { int ind_level, cnt, last, added; off_t size; char buf[TP_BSIZE]; if (newtape) { newtape = 0; dumpmap(dumpinomap, TS_BITS, ino); } CLRINO(ino, dumpinomap); /* * Zero out the size of a snapshot so that it will be dumped * as a zero length file. */ if ((DIP(dp, di_flags) & SF_SNAPSHOT) != 0) { DIP_SET(dp, di_size, 0); DIP_SET(dp, di_flags, DIP(dp, di_flags) & ~SF_SNAPSHOT); } if (sblock->fs_magic == FS_UFS1_MAGIC) { spcl.c_mode = dp->dp1.di_mode; spcl.c_size = dp->dp1.di_size; spcl.c_extsize = 0; spcl.c_atime = _time32_to_time(dp->dp1.di_atime); spcl.c_atimensec = dp->dp1.di_atimensec; spcl.c_mtime = _time32_to_time(dp->dp1.di_mtime); spcl.c_mtimensec = dp->dp1.di_mtimensec; spcl.c_birthtime = 0; spcl.c_birthtimensec = 0; spcl.c_rdev = dp->dp1.di_rdev; spcl.c_file_flags = dp->dp1.di_flags; spcl.c_uid = dp->dp1.di_uid; spcl.c_gid = dp->dp1.di_gid; } else { spcl.c_mode = dp->dp2.di_mode; spcl.c_size = dp->dp2.di_size; spcl.c_extsize = dp->dp2.di_extsize; spcl.c_atime = _time64_to_time(dp->dp2.di_atime); spcl.c_atimensec = dp->dp2.di_atimensec; spcl.c_mtime = _time64_to_time(dp->dp2.di_mtime); spcl.c_mtimensec = dp->dp2.di_mtimensec; spcl.c_birthtime = _time64_to_time(dp->dp2.di_birthtime); spcl.c_birthtimensec = dp->dp2.di_birthnsec; spcl.c_rdev = dp->dp2.di_rdev; spcl.c_file_flags = dp->dp2.di_flags; spcl.c_uid = dp->dp2.di_uid; spcl.c_gid = dp->dp2.di_gid; } spcl.c_type = TS_INODE; spcl.c_count = 0; switch (DIP(dp, di_mode) & S_IFMT) { case 0: /* * Freed inode. */ return; case S_IFLNK: /* * Check for short symbolic link. */ if (DIP(dp, di_size) > 0 && DIP(dp, di_size) < sblock->fs_maxsymlinklen) { spcl.c_addr[0] = 1; spcl.c_count = 1; added = appendextdata(dp); writeheader(ino); if (sblock->fs_magic == FS_UFS1_MAGIC) memmove(buf, (caddr_t)dp->dp1.di_db, (u_long)DIP(dp, di_size)); else memmove(buf, (caddr_t)dp->dp2.di_db, (u_long)DIP(dp, di_size)); buf[DIP(dp, di_size)] = '\0'; writerec(buf, 0); writeextdata(dp, ino, added); return; } /* FALLTHROUGH */ case S_IFDIR: case S_IFREG: if (DIP(dp, di_size) > 0) break; /* FALLTHROUGH */ case S_IFIFO: case S_IFSOCK: case S_IFCHR: case S_IFBLK: added = appendextdata(dp); writeheader(ino); writeextdata(dp, ino, added); return; default: msg("Warning: undefined file type 0%o\n", DIP(dp, di_mode) & IFMT); return; } - if (DIP(dp, di_size) > NDADDR * sblock->fs_bsize) { - cnt = NDADDR * sblock->fs_frag; + if (DIP(dp, di_size) > UFS_NDADDR * sblock->fs_bsize) { + cnt = UFS_NDADDR * sblock->fs_frag; last = 0; } else { cnt = howmany(DIP(dp, di_size), sblock->fs_fsize); last = 1; } if (sblock->fs_magic == FS_UFS1_MAGIC) ufs1_blksout(&dp->dp1.di_db[0], cnt, ino); else ufs2_blksout(dp, &dp->dp2.di_db[0], cnt, ino, last); - if ((size = DIP(dp, di_size) - NDADDR * sblock->fs_bsize) <= 0) + if ((size = DIP(dp, di_size) - UFS_NDADDR * sblock->fs_bsize) <= 0) return; - for (ind_level = 0; ind_level < NIADDR; ind_level++) { + for (ind_level = 0; ind_level < UFS_NIADDR; ind_level++) { dmpindir(dp, ino, DIP(dp, di_ib[ind_level]), ind_level, &size); if (size <= 0) return; } } /* * Read indirect blocks, and pass the data blocks to be dumped. 
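dumpino() above emits at most UFS_NDADDR blocks' worth of data from the direct pointers, expressed in fragments for the blksout routines, and only then descends into the indirect trees with whatever byte count remains. That cutover arithmetic, stand-alone, assuming a hypothetical 32K/4K block/fragment geometry:

#include <stdio.h>
#include <stdint.h>

#define UFS_NDADDR	12
#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	int64_t bsize = 32768, fsize = 4096;	/* assumed geometry */
	int64_t di_size = 500000;		/* hypothetical file length */
	int64_t cnt, size;
	int last;

	if (di_size > UFS_NDADDR * bsize) {
		cnt = UFS_NDADDR * (bsize / fsize);	/* all direct frags */
		last = 0;		/* indirect blocks still to come */
	} else {
		cnt = howmany(di_size, fsize);	/* short file: exact frags */
		last = 1;
	}
	size = di_size - UFS_NDADDR * bsize;	/* bytes left for di_ib[] */
	printf("direct frags %lld, last %d, indirect bytes %lld\n",
	    (long long)cnt, last, (long long)(size > 0 ? size : 0));
	return (0);
}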
*/ static void dmpindir(union dinode *dp, ino_t ino, ufs2_daddr_t blk, int ind_level, off_t *size) { union { ufs1_daddr_t ufs1[MAXBSIZE / sizeof(ufs1_daddr_t)]; ufs2_daddr_t ufs2[MAXBSIZE / sizeof(ufs2_daddr_t)]; } idblk; int i, cnt, last; if (blk != 0) bread(fsbtodb(sblock, blk), (char *)&idblk, (int)sblock->fs_bsize); else memset(&idblk, 0, sblock->fs_bsize); if (ind_level <= 0) { if (*size > NINDIR(sblock) * sblock->fs_bsize) { cnt = NINDIR(sblock) * sblock->fs_frag; last = 0; } else { cnt = howmany(*size, sblock->fs_fsize); last = 1; } *size -= NINDIR(sblock) * sblock->fs_bsize; if (sblock->fs_magic == FS_UFS1_MAGIC) ufs1_blksout(idblk.ufs1, cnt, ino); else ufs2_blksout(dp, idblk.ufs2, cnt, ino, last); return; } ind_level--; for (i = 0; i < NINDIR(sblock); i++) { if (sblock->fs_magic == FS_UFS1_MAGIC) dmpindir(dp, ino, idblk.ufs1[i], ind_level, size); else dmpindir(dp, ino, idblk.ufs2[i], ind_level, size); if (*size <= 0) return; } } /* * Collect up the data into tape record sized buffers and output them. */ static void ufs1_blksout(ufs1_daddr_t *blkp, int frags, ino_t ino) { ufs1_daddr_t *bp; int i, j, count, blks, tbperdb; blks = howmany(frags * sblock->fs_fsize, TP_BSIZE); tbperdb = sblock->fs_bsize >> tp_bshift; for (i = 0; i < blks; i += TP_NINDIR) { if (i + TP_NINDIR > blks) count = blks; else count = i + TP_NINDIR; for (j = i; j < count; j++) if (blkp[j / tbperdb] != 0) spcl.c_addr[j - i] = 1; else spcl.c_addr[j - i] = 0; spcl.c_count = count - i; writeheader(ino); bp = &blkp[i / tbperdb]; for (j = i; j < count; j += tbperdb, bp++) if (*bp != 0) { if (j + tbperdb <= count) dumpblock(*bp, (int)sblock->fs_bsize); else dumpblock(*bp, (count - j) * TP_BSIZE); } spcl.c_type = TS_ADDR; } } /* * Collect up the data into tape record sized buffers and output them. */ static void ufs2_blksout(union dinode *dp, ufs2_daddr_t *blkp, int frags, ino_t ino, int last) { ufs2_daddr_t *bp; int i, j, count, resid, blks, tbperdb, added; static int writingextdata = 0; /* * Calculate the number of TP_BSIZE blocks to be dumped. * For filesystems with a fragment size bigger than TP_BSIZE, * only part of the final fragment may need to be dumped. */ blks = howmany(frags * sblock->fs_fsize, TP_BSIZE); if (last) { if (writingextdata) resid = howmany(fragoff(sblock, spcl.c_extsize), TP_BSIZE); else resid = howmany(fragoff(sblock, dp->dp2.di_size), TP_BSIZE); if (resid > 0) blks -= howmany(sblock->fs_fsize, TP_BSIZE) - resid; } tbperdb = sblock->fs_bsize >> tp_bshift; for (i = 0; i < blks; i += TP_NINDIR) { if (i + TP_NINDIR > blks) count = blks; else count = i + TP_NINDIR; for (j = i; j < count; j++) if (blkp[j / tbperdb] != 0) spcl.c_addr[j - i] = 1; else spcl.c_addr[j - i] = 0; spcl.c_count = count - i; if (last && count == blks && !writingextdata) added = appendextdata(dp); writeheader(ino); bp = &blkp[i / tbperdb]; for (j = i; j < count; j += tbperdb, bp++) if (*bp != 0) { if (j + tbperdb <= count) dumpblock(*bp, (int)sblock->fs_bsize); else dumpblock(*bp, (count - j) * TP_BSIZE); } spcl.c_type = TS_ADDR; spcl.c_count = 0; if (last && count == blks && !writingextdata) { writingextdata = 1; writeextdata(dp, ino, added); writingextdata = 0; } } } /* * If there is room in the current block for the extended attributes * as well as the file data, update the header to reflect the added * attribute data at the end. Attributes are placed at the end so that * old versions of restore will correctly restore the file and simply * discard the extra data at the end that it does not understand. 
* The attribute data is dumped following the file data by the * writeextdata() function (below). */ static int appendextdata(union dinode *dp) { int i, blks, tbperdb; /* * If no extended attributes, there is nothing to do. */ if (spcl.c_extsize == 0) return (0); /* * If there is not enough room at the end of this block * to add the extended attributes, then rather than putting * part of them here, we simply push them entirely into a * new block rather than putting some here and some later. */ - if (spcl.c_extsize > NXADDR * sblock->fs_bsize) - blks = howmany(NXADDR * sblock->fs_bsize, TP_BSIZE); + if (spcl.c_extsize > UFS_NXADDR * sblock->fs_bsize) + blks = howmany(UFS_NXADDR * sblock->fs_bsize, TP_BSIZE); else blks = howmany(spcl.c_extsize, TP_BSIZE); if (spcl.c_count + blks > TP_NINDIR) return (0); /* * Update the block map in the header to indicate the added * extended attribute. They will be appended after the file * data by the writeextdata() routine. */ tbperdb = sblock->fs_bsize >> tp_bshift; for (i = 0; i < blks; i++) if (&dp->dp2.di_extb[i / tbperdb] != 0) spcl.c_addr[spcl.c_count + i] = 1; else spcl.c_addr[spcl.c_count + i] = 0; spcl.c_count += blks; return (blks); } /* * Dump the extended attribute data. If there was room in the file * header, then all we need to do is output the data blocks. If there * was not room in the file header, then an additional TS_ADDR header * is created to hold the attribute data. */ static void writeextdata(union dinode *dp, ino_t ino, int added) { int i, frags, blks, tbperdb, last; ufs2_daddr_t *bp; off_t size; /* * If no extended attributes, there is nothing to do. */ if (spcl.c_extsize == 0) return; /* * If there was no room in the file block for the attributes, * dump them out in a new block, otherwise just dump the data. */ if (added == 0) { - if (spcl.c_extsize > NXADDR * sblock->fs_bsize) { - frags = NXADDR * sblock->fs_frag; + if (spcl.c_extsize > UFS_NXADDR * sblock->fs_bsize) { + frags = UFS_NXADDR * sblock->fs_frag; last = 0; } else { frags = howmany(spcl.c_extsize, sblock->fs_fsize); last = 1; } ufs2_blksout(dp, &dp->dp2.di_extb[0], frags, ino, last); } else { - if (spcl.c_extsize > NXADDR * sblock->fs_bsize) - blks = howmany(NXADDR * sblock->fs_bsize, TP_BSIZE); + if (spcl.c_extsize > UFS_NXADDR * sblock->fs_bsize) + blks = howmany(UFS_NXADDR * sblock->fs_bsize, TP_BSIZE); else blks = howmany(spcl.c_extsize, TP_BSIZE); tbperdb = sblock->fs_bsize >> tp_bshift; for (i = 0; i < blks; i += tbperdb) { bp = &dp->dp2.di_extb[i / tbperdb]; if (*bp != 0) { if (i + tbperdb <= blks) dumpblock(*bp, (int)sblock->fs_bsize); else dumpblock(*bp, (blks - i) * TP_BSIZE); } } } /* * If an indirect block is added for extended attributes, then * di_exti below should be changed to the structure element * that references the extended attribute indirect block. This * definition is here only to make it compile without complaint. */ #define di_exti di_spare[0] /* * If the extended attributes fall into an indirect block, * dump it as well. */ - if ((size = spcl.c_extsize - NXADDR * sblock->fs_bsize) > 0) + if ((size = spcl.c_extsize - UFS_NXADDR * sblock->fs_bsize) > 0) dmpindir(dp, ino, dp->dp2.di_exti, 0, &size); } /* * Dump a map to the tape. */ void dumpmap(char *map, int type, ino_t ino) { int i; char *cp; spcl.c_type = type; spcl.c_count = howmany(mapsize * sizeof(char), TP_BSIZE); writeheader(ino); for (i = 0, cp = map; i < spcl.c_count; i++, cp += TP_BSIZE) writerec(cp, 0); } /* * Write a header record to the dump tape. 
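appendextdata() above piggybacks the extended-attribute records onto the current header only when all of them fit in the remaining TP_NINDIR map slots; otherwise they are left for a separate TS_ADDR header written by writeextdata(). A sketch of the fit test in isolation, using the stock TP_BSIZE/TP_NINDIR/UFS_NXADDR values but otherwise invented numbers:

#include <stdio.h>

#define TP_BSIZE	1024
#define TP_NINDIR	(TP_BSIZE / 2)	/* block-map slots per header */
#define UFS_NXADDR	2		/* external attribute block pointers */
#define howmany(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	long fs_bsize = 16384;	/* assumed file system block size */
	long c_extsize = 3000;	/* hypothetical attribute byte count */
	long c_count = 400;	/* records already mapped in this header */
	long blks;

	if (c_extsize > UFS_NXADDR * fs_bsize)
		blks = howmany(UFS_NXADDR * fs_bsize, TP_BSIZE);
	else
		blks = howmany(c_extsize, TP_BSIZE);
	if (c_count + blks > TP_NINDIR)
		printf("no room: attributes get their own header\n");
	else
		printf("append %ld attribute records here\n", blks);
	return (0);
}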
*/ void writeheader(ino_t ino) { int32_t sum, cnt, *lp; if (rsync_friendly >= 2) { /* don't track changes to access time */ spcl.c_atime = spcl.c_mtime; spcl.c_atimensec = spcl.c_mtimensec; } spcl.c_inumber = ino; spcl.c_magic = FS_UFS2_MAGIC; spcl.c_checksum = 0; lp = (int32_t *)&spcl; sum = 0; cnt = sizeof(union u_spcl) / (4 * sizeof(int32_t)); while (--cnt >= 0) { sum += *lp++; sum += *lp++; sum += *lp++; sum += *lp++; } spcl.c_checksum = CHECKSUM - sum; writerec((char *)&spcl, 1); } union dinode * getino(ino_t inum, int *modep) { static ino_t minino, maxino; static caddr_t inoblock; struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; if (inoblock == NULL && (inoblock = malloc(sblock->fs_bsize)) == NULL) quit("cannot allocate inode memory.\n"); curino = inum; if (inum >= minino && inum < maxino) goto gotit; bread(fsbtodb(sblock, ino_to_fsba(sblock, inum)), inoblock, (int)sblock->fs_bsize); minino = inum - (inum % INOPB(sblock)); maxino = minino + INOPB(sblock); gotit: if (sblock->fs_magic == FS_UFS1_MAGIC) { dp1 = &((struct ufs1_dinode *)inoblock)[inum - minino]; *modep = (dp1->di_mode & IFMT); return ((union dinode *)dp1); } dp2 = &((struct ufs2_dinode *)inoblock)[inum - minino]; *modep = (dp2->di_mode & IFMT); return ((union dinode *)dp2); } /* * Read a chunk of data from the disk. * Try to recover from hard errors by reading in sector sized pieces. * Error recovery is attempted at most BREADEMAX times before seeking * consent from the operator to continue. */ int breaderrors = 0; #define BREADEMAX 32 void bread(ufs2_daddr_t blkno, char *buf, int size) { int secsize, bytes, resid, xfer, base, cnt, i; static char *tmpbuf; off_t offset; loop: offset = blkno << dev_bshift; secsize = sblock->fs_fsize; base = offset % secsize; resid = size % secsize; /* * If the transfer request starts or ends on a non-sector * boundary, we must read the entire sector and copy out * just the part that we need. */ if (base == 0 && resid == 0) { cnt = cread(diskfd, buf, size, offset); if (cnt == size) return; } else { if (tmpbuf == NULL && (tmpbuf = malloc(secsize)) == NULL) quit("buffer malloc failed\n"); xfer = 0; bytes = size; if (base != 0) { cnt = cread(diskfd, tmpbuf, secsize, offset - base); if (cnt != secsize) goto bad; xfer = MIN(secsize - base, size); offset += xfer; bytes -= xfer; resid = bytes % secsize; memcpy(buf, &tmpbuf[base], xfer); } if (bytes >= secsize) { cnt = cread(diskfd, &buf[xfer], bytes - resid, offset); if (cnt != bytes - resid) goto bad; xfer += cnt; offset += cnt; } if (resid == 0) return; cnt = cread(diskfd, tmpbuf, secsize, offset); if (cnt == secsize) { memcpy(&buf[xfer], tmpbuf, resid); return; } } bad: if (blkno + (size / dev_bsize) > fsbtodb(sblock, sblock->fs_size)) { /* * Trying to read the final fragment. * * NB - dump only works in TP_BSIZE blocks, hence * rounds `dev_bsize' fragments up to TP_BSIZE pieces. * It should be smarter about not actually trying to * read more than it can get, but for the time being * we punt and scale back the read only when it gets * us into trouble. 
(mkm 9/25/83) */ size -= dev_bsize; goto loop; } if (cnt == -1) msg("read error from %s: %s: [block %jd]: count=%d\n", disk, strerror(errno), (intmax_t)blkno, size); else msg("short read error from %s: [block %jd]: count=%d, got=%d\n", disk, (intmax_t)blkno, size, cnt); if (++breaderrors > BREADEMAX) { msg("More than %d block read errors from %s\n", BREADEMAX, disk); broadcast("DUMP IS AILING!\n"); msg("This is an unrecoverable error.\n"); if (!query("Do you want to attempt to continue?")){ dumpabort(0); /*NOTREACHED*/ } else breaderrors = 0; } /* * Zero buffer, then try to read each sector of buffer separately, * and bypass the cache. */ memset(buf, 0, size); for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { if ((cnt = pread(diskfd, buf, (int)dev_bsize, ((off_t)blkno << dev_bshift))) == dev_bsize) continue; if (cnt == -1) { msg("read error from %s: %s: [sector %jd]: count=%ld\n", disk, strerror(errno), (intmax_t)blkno, dev_bsize); continue; } msg("short read from %s: [sector %jd]: count=%ld, got=%d\n", disk, (intmax_t)blkno, dev_bsize, cnt); } } Index: head/sbin/ffsinfo/ffsinfo.c =================================================================== --- head/sbin/ffsinfo/ffsinfo.c (revision 313779) +++ head/sbin/ffsinfo/ffsinfo.c (revision 313780) @@ -1,646 +1,646 @@ /* * Copyright (c) 2000 Christoph Herrmann, Thomas-Henning von Kamptz * Copyright (c) 1980, 1989, 1993 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * Christoph Herrmann and Thomas-Henning von Kamptz, Munich and Frankfurt. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgment: * This product includes software developed by the University of * California, Berkeley and its contributors, as well as Christoph * Herrmann and Thomas-Henning von Kamptz. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 *
 * $TSHeader: src/sbin/ffsinfo/ffsinfo.c,v 1.4 2000/12/12 19:30:55 tomsoft Exp $
 *
 */

#ifndef lint
static const char copyright[] =
"@(#) Copyright (c) 2000 Christoph Herrmann, Thomas-Henning von Kamptz\n\
Copyright (c) 1980, 1989, 1993 The Regents of the University of California.\n\
All rights reserved.\n";
#endif /* not lint */

#ifndef lint
static const char rcsid[] =
  "$FreeBSD$";
#endif /* not lint */

/* ********************************************************** INCLUDES ***** */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "debug.h"

/* *********************************************************** GLOBALS ***** */
#ifdef FS_DEBUG
int	_dbg_lvl_ = (DL_INFO);	/* DL_TRC */
#endif /* FS_DEBUG */

static struct uufsd disk;

#define sblock disk.d_fs
#define acg disk.d_cg

static union {
	struct fs fs;
	char pad[SBLOCKSIZE];
} fsun;

#define osblock fsun.fs

static char i1blk[MAXBSIZE];
static char i2blk[MAXBSIZE];
static char i3blk[MAXBSIZE];

static struct csum *fscs;

/* ******************************************************** PROTOTYPES ***** */
static void usage(void);
static void dump_whole_ufs1_inode(ino_t, int);
static void dump_whole_ufs2_inode(ino_t, int);

#define DUMP_WHOLE_INODE(A,B) \
	( disk.d_ufs == 1 \
	? dump_whole_ufs1_inode((A),(B)) : dump_whole_ufs2_inode((A),(B)) )

/* ************************************************************** main ***** */
/*
 * ffsinfo(8) is a tool to dump all metadata of a file system. It makes
 * finding errors in the file system much easier. You can run ffsinfo
 * before and after an fsck(8), compare the two ASCII dumps with diff, and
 * see directly where the problem is. You can control how much detail you
 * want to see with some command line arguments. You can also easily check
 * the status of a file system, such as whether there is enough space for
 * growing it, or how many active snapshots it has. It provides much more
 * detailed information than dumpfs. Snapshots, as they are very new, are
 * not really supported; they are currently only mentioned, but it is
 * planned to run over active snapshots as well, to get that output too.
 */
int
main(int argc, char **argv)
{
	DBG_FUNC("main")
	char	*device, *special;
	int	ch;
	size_t	len;
	struct stat	st;
	struct csum	*dbg_csp;
	int	dbg_csc;
	char	dbg_line[80];
	int	cylno,i;
	int	cfg_cg, cfg_in, cfg_lv;
	int	cg_start, cg_stop;
	ino_t	in;
	char	*out_file;

	DBG_ENTER;

	cfg_lv = 0xff;
	cfg_in = -2;
	cfg_cg = -2;
	out_file = strdup("-");

	while ((ch = getopt(argc, argv, "g:i:l:o:")) != -1) {
		switch (ch) {
		case 'g':
			cfg_cg = strtol(optarg, NULL, 0);
			if (errno == EINVAL || errno == ERANGE)
				err(1, "%s", optarg);
			if (cfg_cg < -1)
				usage();
			break;
		case 'i':
			cfg_in = strtol(optarg, NULL, 0);
			if (errno == EINVAL || errno == ERANGE)
				err(1, "%s", optarg);
			if (cfg_in < 0)
				usage();
			break;
		case 'l':
			cfg_lv = strtol(optarg, NULL, 0);
			if (errno == EINVAL||errno == ERANGE)
				err(1, "%s", optarg);
			if (cfg_lv < 0x1 || cfg_lv > 0x3ff)
				usage();
			break;
		case 'o':
			free(out_file);
			out_file = strdup(optarg);
			if (out_file == NULL)
				errx(1, "strdup failed");
			break;
		case '?':
			/* FALLTHROUGH */
		default:
			usage();
		}
	}
	argc -= optind;
	argv += optind;

	if (argc != 1)
		usage();
	device = *argv;

	/*
	 * Now we try to guess the (raw)device name.
	 */
	if (0 == strrchr(device, '/') && stat(device, &st) == -1) {
		/*-
		 * No path prefix was given, so try in this order:
		 *     /dev/r%s
		 *     /dev/%s
		 *     /dev/vinum/r%s
		 *     /dev/vinum/%s.
* * FreeBSD now doesn't distinguish between raw and block * devices any longer, but it should still work this way. */ len = strlen(device) + strlen(_PATH_DEV) + 2 + strlen("vinum/"); special = (char *)malloc(len); if (special == NULL) errx(1, "malloc failed"); snprintf(special, len, "%sr%s", _PATH_DEV, device); if (stat(special, &st) == -1) { snprintf(special, len, "%s%s", _PATH_DEV, device); if (stat(special, &st) == -1) { snprintf(special, len, "%svinum/r%s", _PATH_DEV, device); if (stat(special, &st) == -1) /* For now this is the 'last resort' */ snprintf(special, len, "%svinum/%s", _PATH_DEV, device); } } device = special; } if (ufs_disk_fillout(&disk, device) == -1) err(1, "ufs_disk_fillout(%s) failed: %s", device, disk.d_error); DBG_OPEN(out_file); /* already here we need a superblock */ if (cfg_lv & 0x001) DBG_DUMP_FS(&sblock, "primary sblock"); /* Determine here what cylinder groups to dump */ if (cfg_cg==-2) { cg_start = 0; cg_stop = sblock.fs_ncg; } else if (cfg_cg == -1) { cg_start = sblock.fs_ncg - 1; cg_stop = sblock.fs_ncg; } else if (cfg_cg < sblock.fs_ncg) { cg_start = cfg_cg; cg_stop = cfg_cg + 1; } else { cg_start = sblock.fs_ncg; cg_stop = sblock.fs_ncg; } if (cfg_lv & 0x004) { fscs = (struct csum *)calloc((size_t)1, (size_t)sblock.fs_cssize); if (fscs == NULL) errx(1, "calloc failed"); /* get the cylinder summary into the memory ... */ for (i = 0; i < sblock.fs_cssize; i += sblock.fs_bsize) { if (bread(&disk, fsbtodb(&sblock, sblock.fs_csaddr + numfrags(&sblock, i)), (void *)(((char *)fscs)+i), (size_t)(sblock.fs_cssize-i < sblock.fs_bsize ? sblock.fs_cssize - i : sblock.fs_bsize)) == -1) err(1, "bread: %s", disk.d_error); } dbg_csp = fscs; /* ... and dump it */ for(dbg_csc=0; dbg_cscdi_nlink==0) { DBG_LEAVE; return; /* inode not in use */ } /* * Dump the main inode structure. */ snprintf(comment, sizeof(comment), "Inode 0x%08jx", (uintmax_t)inode); if (level & 0x100) { DBG_DUMP_INO(&sblock, comment, ino); } if (!(level & 0x200)) { DBG_LEAVE; return; } /* * Ok, now prepare for dumping all direct and indirect pointers. */ - rb=howmany(ino->di_size, sblock.fs_bsize)-NDADDR; + rb = howmany(ino->di_size, sblock.fs_bsize) - UFS_NDADDR; if(rb>0) { /* * Dump single indirect block. */ if (bread(&disk, fsbtodb(&sblock, ino->di_ib[0]), (void *)&i1blk, (size_t)sblock.fs_bsize) == -1) { err(1, "bread: %s", disk.d_error); } snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 0", (uintmax_t)inode); DBG_DUMP_IBLK(&sblock, comment, i1blk, (size_t)rb); rb-=howmany(sblock.fs_bsize, sizeof(ufs1_daddr_t)); } if(rb>0) { /* * Dump double indirect blocks. */ if (bread(&disk, fsbtodb(&sblock, ino->di_ib[1]), (void *)&i2blk, (size_t)sblock.fs_bsize) == -1) { err(1, "bread: %s", disk.d_error); } snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 1", (uintmax_t)inode); DBG_DUMP_IBLK(&sblock, comment, i2blk, howmany(rb, howmany(sblock.fs_bsize, sizeof(ufs1_daddr_t)))); for(ind2ctr=0; ((ind2ctr < howmany(sblock.fs_bsize, sizeof(ufs1_daddr_t))) && (rb>0)); ind2ctr++) { ind2ptr=&((ufs1_daddr_t *)(void *)&i2blk)[ind2ctr]; if (bread(&disk, fsbtodb(&sblock, *ind2ptr), (void *)&i1blk, (size_t)sblock.fs_bsize) == -1) { err(1, "bread: %s", disk.d_error); } snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 1->%d", (uintmax_t)inode, ind2ctr); DBG_DUMP_IBLK(&sblock, comment, i1blk, (size_t)rb); rb-=howmany(sblock.fs_bsize, sizeof(ufs1_daddr_t)); } } if(rb>0) { /* * Dump triple indirect blocks. 
	 */
	if (bread(&disk, fsbtodb(&sblock, ino->di_ib[2]), (void *)&i3blk,
	    (size_t)sblock.fs_bsize) == -1) {
		err(1, "bread: %s", disk.d_error);
	}
	snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 2",
	    (uintmax_t)inode);
#define SQUARE(a) ((a)*(a))
	DBG_DUMP_IBLK(&sblock, comment, i3blk, howmany(rb,
	    SQUARE(howmany(sblock.fs_bsize, sizeof(ufs1_daddr_t)))));
#undef SQUARE
	for(ind3ctr=0; ((ind3ctr<howmany(sblock.fs_bsize,
	    sizeof(ufs1_daddr_t)))&&(rb>0)); ind3ctr++) {
		ind3ptr=&((ufs1_daddr_t *)(void *)&i3blk)[ind3ctr];
		if (bread(&disk, fsbtodb(&sblock, *ind3ptr), (void *)&i2blk,
		    (size_t)sblock.fs_bsize) == -1) {
			err(1, "bread: %s", disk.d_error);
		}
		snprintf(comment, sizeof(comment),
		    "Inode 0x%08jx: indirect 2->%d", (uintmax_t)inode,
		    ind3ctr);
		DBG_DUMP_IBLK(&sblock, comment, i2blk, howmany(rb,
		    howmany(sblock.fs_bsize, sizeof(ufs1_daddr_t))));
		for(ind2ctr=0; ((ind2ctr < howmany(sblock.fs_bsize,
		    sizeof(ufs1_daddr_t)))&&(rb>0)); ind2ctr++) {
			ind2ptr=&((ufs1_daddr_t *)(void *)&i2blk)
			    [ind2ctr];
			if (bread(&disk, fsbtodb(&sblock, *ind2ptr),
			    (void *)&i1blk, (size_t)sblock.fs_bsize) == -1) {
				err(1, "bread: %s", disk.d_error);
			}
			snprintf(comment, sizeof(comment),
			    "Inode 0x%08jx: indirect 2->%d->%d",
			    (uintmax_t)inode, ind3ctr, ind3ctr);
			DBG_DUMP_IBLK(&sblock, comment, i1blk, (size_t)rb);
			rb-=howmany(sblock.fs_bsize, sizeof(ufs1_daddr_t));
		}
	}
	}

	DBG_LEAVE;
	return;
}

/* ********************************************** dump_whole_ufs2_inode ***** */
/*
 * Here we dump a list of all blocks allocated by this inode. We follow
 * all indirect blocks.
 */
void
dump_whole_ufs2_inode(ino_t inode, int level)
{
	DBG_FUNC("dump_whole_ufs2_inode")
	struct ufs2_dinode *ino;
	int rb, mode;
	unsigned int ind2ctr, ind3ctr;
	ufs2_daddr_t *ind2ptr, *ind3ptr;
	char comment[80];

	DBG_ENTER;

	/*
	 * Read the inode from disk/cache.
	 */
	if (getino(&disk, (void **)&ino, inode, &mode) == -1)
		err(1, "getino: %s", disk.d_error);

	if (ino->di_nlink == 0) {
		DBG_LEAVE;
		return;	/* inode not in use */
	}

	/*
	 * Dump the main inode structure.
	 */
	snprintf(comment, sizeof(comment), "Inode 0x%08jx", (uintmax_t)inode);
	if (level & 0x100) {
		DBG_DUMP_INO(&sblock, comment, ino);
	}

	if (!(level & 0x200)) {
		DBG_LEAVE;
		return;
	}

	/*
	 * Ok, now prepare for dumping all direct and indirect pointers.
	 */
-	rb = howmany(ino->di_size, sblock.fs_bsize) - NDADDR;
+	rb = howmany(ino->di_size, sblock.fs_bsize) - UFS_NDADDR;
	if (rb > 0) {
		/*
		 * Dump single indirect block.
		 */
		if (bread(&disk, fsbtodb(&sblock, ino->di_ib[0]),
		    (void *)&i1blk, (size_t)sblock.fs_bsize) == -1) {
			err(1, "bread: %s", disk.d_error);
		}
		snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 0",
		    (uintmax_t)inode);
		DBG_DUMP_IBLK(&sblock, comment, i1blk, (size_t)rb);
		rb -= howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t));
	}

	if (rb > 0) {
		/*
		 * Dump double indirect blocks.
*/ if (bread(&disk, fsbtodb(&sblock, ino->di_ib[1]), (void *)&i2blk, (size_t)sblock.fs_bsize) == -1) { err(1, "bread: %s", disk.d_error); } snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 1", (uintmax_t)inode); DBG_DUMP_IBLK(&sblock, comment, i2blk, howmany(rb, howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t)))); for (ind2ctr = 0; ((ind2ctr < howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t))) && (rb>0)); ind2ctr++) { ind2ptr = &((ufs2_daddr_t *)(void *)&i2blk)[ind2ctr]; if (bread(&disk, fsbtodb(&sblock, *ind2ptr), (void *)&i1blk, (size_t)sblock.fs_bsize) == -1) { err(1, "bread: %s", disk.d_error); } snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 1->%d", (uintmax_t)inode, ind2ctr); DBG_DUMP_IBLK(&sblock, comment, i1blk, (size_t)rb); rb -= howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t)); } } if (rb > 0) { /* * Dump triple indirect blocks. */ if (bread(&disk, fsbtodb(&sblock, ino->di_ib[2]), (void *)&i3blk, (size_t)sblock.fs_bsize) == -1) { err(1, "bread: %s", disk.d_error); } snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 2", (uintmax_t)inode); #define SQUARE(a) ((a)*(a)) DBG_DUMP_IBLK(&sblock, comment, i3blk, howmany(rb, SQUARE(howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t))))); #undef SQUARE for (ind3ctr = 0; ((ind3ctr < howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t))) && (rb > 0)); ind3ctr++) { ind3ptr = &((ufs2_daddr_t *)(void *)&i3blk)[ind3ctr]; if (bread(&disk, fsbtodb(&sblock, *ind3ptr), (void *)&i2blk, (size_t)sblock.fs_bsize) == -1) { err(1, "bread: %s", disk.d_error); } snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 2->%d", (uintmax_t)inode, ind3ctr); DBG_DUMP_IBLK(&sblock, comment, i2blk, howmany(rb, howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t)))); for (ind2ctr = 0; ((ind2ctr < howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t))) && (rb > 0)); ind2ctr++) { ind2ptr = &((ufs2_daddr_t *)(void *)&i2blk) [ind2ctr]; if (bread(&disk, fsbtodb(&sblock, *ind2ptr), (void *)&i1blk, (size_t)sblock.fs_bsize) == -1) { err(1, "bread: %s", disk.d_error); } snprintf(comment, sizeof(comment), "Inode 0x%08jx: indirect 2->%d->%d", (uintmax_t)inode, ind3ctr, ind3ctr); DBG_DUMP_IBLK(&sblock, comment, i1blk, (size_t)rb); rb -= howmany(sblock.fs_bsize, sizeof(ufs2_daddr_t)); } } } DBG_LEAVE; return; } /* ************************************************************* usage ***** */ /* * Dump a line of usage. */ void usage(void) { DBG_FUNC("usage") DBG_ENTER; fprintf(stderr, "usage: ffsinfo [-g cylinder_group] [-i inode] [-l level] " "[-o outfile]\n" " special | file\n"); DBG_LEAVE; exit(1); } Index: head/sbin/fsck_ffs/dir.c =================================================================== --- head/sbin/fsck_ffs/dir.c (revision 313779) +++ head/sbin/fsck_ffs/dir.c (revision 313780) @@ -1,709 +1,710 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)dir.c 8.8 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "fsck.h" static struct dirtemplate emptydir = { 0, DIRBLKSIZ, DT_UNKNOWN, 0, "", 0, 0, DT_UNKNOWN, 0, "" }; static struct dirtemplate dirhead = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static int chgino(struct inodesc *); static int dircheck(struct inodesc *, struct direct *); static int expanddir(union dinode *dp, char *name); static void freedir(ino_t ino, ino_t parent); static struct direct *fsck_readdir(struct inodesc *); static struct bufarea *getdirblk(ufs2_daddr_t blkno, long size); static int lftempname(char *bufp, ino_t ino); static int mkentry(struct inodesc *); /* * Propagate connected state through the tree. */ void propagate(void) { struct inoinfo **inpp, *inp; struct inoinfo **inpend; long change; inpend = &inpsort[inplast]; do { change = 0; for (inpp = inpsort; inpp < inpend; inpp++) { inp = *inpp; if (inp->i_parent == 0) continue; if (inoinfo(inp->i_parent)->ino_state == DFOUND && INO_IS_DUNFOUND(inp->i_number)) { inoinfo(inp->i_number)->ino_state = DFOUND; change++; } } } while (change > 0); } /* * Scan each entry in a directory block. */ int dirscan(struct inodesc *idesc) { struct direct *dp; struct bufarea *bp; u_int dsize, n; long blksiz; char dbuf[DIRBLKSIZ]; if (idesc->id_type != DATA) errx(EEXIT, "wrong type to dirscan %d", idesc->id_type); if (idesc->id_entryno == 0 && (idesc->id_filesize & (DIRBLKSIZ - 1)) != 0) idesc->id_filesize = roundup(idesc->id_filesize, DIRBLKSIZ); blksiz = idesc->id_numfrags * sblock.fs_fsize; if (chkrange(idesc->id_blkno, idesc->id_numfrags)) { idesc->id_filesize -= blksiz; return (SKIP); } idesc->id_loc = 0; for (dp = fsck_readdir(idesc); dp != NULL; dp = fsck_readdir(idesc)) { dsize = dp->d_reclen; if (dsize > sizeof(dbuf)) dsize = sizeof(dbuf); memmove(dbuf, dp, (size_t)dsize); idesc->id_dirp = (struct direct *)dbuf; if ((n = (*idesc->id_func)(idesc)) & ALTERED) { bp = getdirblk(idesc->id_blkno, blksiz); memmove(bp->b_un.b_buf + idesc->id_loc - dsize, dbuf, (size_t)dsize); dirty(bp); sbdirty(); rerun = 1; } if (n & STOP) return (n); } return (idesc->id_filesize > 0 ? KEEPON : STOP); } /* * get next entry in a directory. 
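dirscan() above, like searchdir() in traverse.c, advances through a directory block by hopping d_reclen bytes from one entry to the next; fsck_readdir() below adds the corruption checks. A stand-alone miniature of that hop over a fabricated block; struct dent is a trimmed-down stand-in, not the on-disk struct direct layout:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct dent {			/* illustrative stand-in only */
	uint32_t d_ino;
	uint16_t d_reclen;
	char	 d_name[10];
};

int
main(void)
{
	union { char b[64]; uint32_t align; } blk;
	struct dent *dp, *edp;

	memset(blk.b, 0, sizeof(blk.b));
	dp = (struct dent *)blk.b;
	dp->d_ino = 2; dp->d_reclen = 16; strcpy(dp->d_name, ".");
	dp = (struct dent *)(blk.b + 16);
	dp->d_ino = 2; dp->d_reclen = 16; strcpy(dp->d_name, "..");
	dp = (struct dent *)(blk.b + 32);
	dp->d_ino = 5; dp->d_reclen = 32; strcpy(dp->d_name, "file");

	edp = (struct dent *)(blk.b + sizeof(blk.b));
	for (dp = (struct dent *)blk.b; dp < edp;
	    dp = (struct dent *)((char *)dp + dp->d_reclen)) {
		if (dp->d_reclen == 0)	/* corrupt: would loop forever */
			break;
		if (dp->d_ino != 0)
			printf("ino %u name %s\n",
			    (unsigned)dp->d_ino, dp->d_name);
	}
	return (0);
}

A zero d_reclen is exactly the corruption searchdir() reports above, since the walk could never advance past it.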
*/ static struct direct * fsck_readdir(struct inodesc *idesc) { struct direct *dp, *ndp; struct bufarea *bp; long size, blksiz, fix, dploc; blksiz = idesc->id_numfrags * sblock.fs_fsize; bp = getdirblk(idesc->id_blkno, blksiz); if (idesc->id_loc % DIRBLKSIZ == 0 && idesc->id_filesize > 0 && idesc->id_loc < blksiz) { dp = (struct direct *)(bp->b_un.b_buf + idesc->id_loc); if (dircheck(idesc, dp)) goto dpok; if (idesc->id_fix == IGNORE) return (0); fix = dofix(idesc, "DIRECTORY CORRUPTED"); bp = getdirblk(idesc->id_blkno, blksiz); dp = (struct direct *)(bp->b_un.b_buf + idesc->id_loc); dp->d_reclen = DIRBLKSIZ; dp->d_ino = 0; dp->d_type = 0; dp->d_namlen = 0; dp->d_name[0] = '\0'; if (fix) dirty(bp); idesc->id_loc += DIRBLKSIZ; idesc->id_filesize -= DIRBLKSIZ; return (dp); } dpok: if (idesc->id_filesize <= 0 || idesc->id_loc >= blksiz) return NULL; dploc = idesc->id_loc; dp = (struct direct *)(bp->b_un.b_buf + dploc); idesc->id_loc += dp->d_reclen; idesc->id_filesize -= dp->d_reclen; if ((idesc->id_loc % DIRBLKSIZ) == 0) return (dp); ndp = (struct direct *)(bp->b_un.b_buf + idesc->id_loc); if (idesc->id_loc < blksiz && idesc->id_filesize > 0 && dircheck(idesc, ndp) == 0) { size = DIRBLKSIZ - (idesc->id_loc % DIRBLKSIZ); idesc->id_loc += size; idesc->id_filesize -= size; if (idesc->id_fix == IGNORE) return (0); fix = dofix(idesc, "DIRECTORY CORRUPTED"); bp = getdirblk(idesc->id_blkno, blksiz); dp = (struct direct *)(bp->b_un.b_buf + dploc); dp->d_reclen += size; if (fix) dirty(bp); } return (dp); } /* * Verify that a directory entry is valid. * This is a superset of the checks made in the kernel. */ static int dircheck(struct inodesc *idesc, struct direct *dp) { size_t size; char *cp; u_char type; u_int8_t namlen; int spaceleft; spaceleft = DIRBLKSIZ - (idesc->id_loc % DIRBLKSIZ); if (dp->d_reclen == 0 || dp->d_reclen > spaceleft || (dp->d_reclen & 0x3) != 0) goto bad; if (dp->d_ino == 0) return (1); size = DIRSIZ(0, dp); namlen = dp->d_namlen; type = dp->d_type; if (dp->d_reclen < size || idesc->id_filesize < size || namlen == 0 || type > 15) goto bad; for (cp = dp->d_name, size = 0; size < namlen; size++) if (*cp == '\0' || (*cp++ == '/')) goto bad; if (*cp != '\0') goto bad; return (1); bad: if (debug) printf("Bad dir: ino %d reclen %d namlen %d type %d name %s\n", dp->d_ino, dp->d_reclen, dp->d_namlen, dp->d_type, dp->d_name); return (0); } void direrror(ino_t ino, const char *errmesg) { fileerror(ino, ino, errmesg); } void fileerror(ino_t cwd, ino_t ino, const char *errmesg) { union dinode *dp; char pathbuf[MAXPATHLEN + 1]; pwarn("%s ", errmesg); pinode(ino); printf("\n"); getpathname(pathbuf, cwd, ino); - if (ino < ROOTINO || ino > maxino) { + if (ino < UFS_ROOTINO || ino > maxino) { pfatal("NAME=%s\n", pathbuf); return; } dp = ginode(ino); if (ftypeok(dp)) pfatal("%s=%s\n", (DIP(dp, di_mode) & IFMT) == IFDIR ? "DIR" : "FILE", pathbuf); else pfatal("NAME=%s\n", pathbuf); } void adjust(struct inodesc *idesc, int lcnt) { union dinode *dp; int saveresolved; dp = ginode(idesc->id_number); if (DIP(dp, di_nlink) == lcnt) { /* * If we have not hit any unresolved problems, are running * in preen mode, and are on a file system using soft updates, * then just toss any partially allocated files. */ if (resolved && (preen || bkgrdflag) && usedsoftdep) { clri(idesc, "UNREF", 1); return; } else { /* * The file system can be marked clean even if * a file is not linked up, but is cleared. * Hence, resolved should not be cleared when * linkup is answered no, but clri is answered yes. 
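dircheck() above refuses to trust an entry until its d_reclen is nonzero, 4-byte aligned, and within the space left in the current DIRBLKSIZ chunk; only then are the name and type examined. The structural part of those checks, stand-alone with invented offsets (a DIRBLKSIZ of 512 is assumed):

#include <stdio.h>

#define DIRBLKSIZ	512	/* assumed directory block size */

/* the reclen screen from dircheck(), minus the name/type checks */
static int
reclen_ok(int id_loc, int d_reclen)
{
	int spaceleft = DIRBLKSIZ - (id_loc % DIRBLKSIZ);

	if (d_reclen == 0 || d_reclen > spaceleft || (d_reclen & 0x3) != 0)
		return (0);
	return (1);
}

int
main(void)
{
	printf("%d\n", reclen_ok(0, 512));	/* 1: fills the chunk */
	printf("%d\n", reclen_ok(500, 16));	/* 0: overruns the chunk */
	printf("%d\n", reclen_ok(24, 13));	/* 0: not 4-byte aligned */
	printf("%d\n", reclen_ok(24, 0));	/* 0: would never advance */
	return (0);
}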
*/ saveresolved = resolved; if (linkup(idesc->id_number, (ino_t)0, NULL) == 0) { resolved = saveresolved; clri(idesc, "UNREF", 0); return; } /* * Account for the new reference created by linkup(). */ dp = ginode(idesc->id_number); lcnt--; } } if (lcnt != 0) { pwarn("LINK COUNT %s", (lfdir == idesc->id_number) ? lfname : ((DIP(dp, di_mode) & IFMT) == IFDIR ? "DIR" : "FILE")); pinode(idesc->id_number); printf(" COUNT %d SHOULD BE %d", DIP(dp, di_nlink), DIP(dp, di_nlink) - lcnt); if (preen || usedsoftdep) { if (lcnt < 0) { printf("\n"); pfatal("LINK COUNT INCREASING"); } if (preen) printf(" (ADJUSTED)\n"); } if (preen || reply("ADJUST") == 1) { if (bkgrdflag == 0) { DIP_SET(dp, di_nlink, DIP(dp, di_nlink) - lcnt); inodirty(); } else { cmd.value = idesc->id_number; cmd.size = -lcnt; if (debug) printf("adjrefcnt ino %ld amt %lld\n", (long)cmd.value, (long long)cmd.size); if (sysctl(adjrefcnt, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST INODE", cmd.value); } } } } static int mkentry(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; struct direct newent; int newlen, oldlen; newent.d_namlen = strlen(idesc->id_name); newlen = DIRSIZ(0, &newent); if (dirp->d_ino != 0) oldlen = DIRSIZ(0, dirp); else oldlen = 0; if (dirp->d_reclen - oldlen < newlen) return (KEEPON); newent.d_reclen = dirp->d_reclen - oldlen; dirp->d_reclen = oldlen; dirp = (struct direct *)(((char *)dirp) + oldlen); dirp->d_ino = idesc->id_parent; /* ino to be entered is in id_parent */ dirp->d_reclen = newent.d_reclen; dirp->d_type = inoinfo(idesc->id_parent)->ino_type; dirp->d_namlen = newent.d_namlen; memmove(dirp->d_name, idesc->id_name, (size_t)newent.d_namlen + 1); return (ALTERED|STOP); } static int chgino(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (memcmp(dirp->d_name, idesc->id_name, (int)dirp->d_namlen + 1)) return (KEEPON); dirp->d_ino = idesc->id_parent; dirp->d_type = inoinfo(idesc->id_parent)->ino_type; return (ALTERED|STOP); } int linkup(ino_t orphan, ino_t parentdir, char *name) { union dinode *dp; int lostdir; ino_t oldlfdir; struct inodesc idesc; char tempname[BUFSIZ]; memset(&idesc, 0, sizeof(struct inodesc)); dp = ginode(orphan); lostdir = (DIP(dp, di_mode) & IFMT) == IFDIR; pwarn("UNREF %s ", lostdir ? "DIR" : "FILE"); pinode(orphan); if (preen && DIP(dp, di_size) == 0) return (0); if (cursnapshot != 0) { pfatal("FILE LINKUP IN SNAPSHOT"); return (0); } if (preen) printf(" (RECONNECTED)\n"); else if (reply("RECONNECT") == 0) return (0); if (lfdir == 0) { - dp = ginode(ROOTINO); + dp = ginode(UFS_ROOTINO); idesc.id_name = strdup(lfname); idesc.id_type = DATA; idesc.id_func = findino; - idesc.id_number = ROOTINO; + idesc.id_number = UFS_ROOTINO; if ((ckinode(dp, &idesc) & FOUND) != 0) { lfdir = idesc.id_parent; } else { pwarn("NO lost+found DIRECTORY"); if (preen || reply("CREATE")) { - lfdir = allocdir(ROOTINO, (ino_t)0, lfmode); + lfdir = allocdir(UFS_ROOTINO, (ino_t)0, lfmode); if (lfdir != 0) { - if (makeentry(ROOTINO, lfdir, lfname) != 0) { + if (makeentry(UFS_ROOTINO, lfdir, + lfname) != 0) { numdirs++; if (preen) printf(" (CREATED)\n"); } else { - freedir(lfdir, ROOTINO); + freedir(lfdir, UFS_ROOTINO); lfdir = 0; if (preen) printf("\n"); } } } } if (lfdir == 0) { pfatal("SORRY. 
CANNOT CREATE lost+found DIRECTORY"); printf("\n\n"); return (0); } } dp = ginode(lfdir); if ((DIP(dp, di_mode) & IFMT) != IFDIR) { pfatal("lost+found IS NOT A DIRECTORY"); if (reply("REALLOCATE") == 0) return (0); oldlfdir = lfdir; - if ((lfdir = allocdir(ROOTINO, (ino_t)0, lfmode)) == 0) { + if ((lfdir = allocdir(UFS_ROOTINO, (ino_t)0, lfmode)) == 0) { pfatal("SORRY. CANNOT CREATE lost+found DIRECTORY\n\n"); return (0); } - if ((changeino(ROOTINO, lfname, lfdir) & ALTERED) == 0) { + if ((changeino(UFS_ROOTINO, lfname, lfdir) & ALTERED) == 0) { pfatal("SORRY. CANNOT CREATE lost+found DIRECTORY\n\n"); return (0); } inodirty(); idesc.id_type = ADDR; idesc.id_func = pass4check; idesc.id_number = oldlfdir; adjust(&idesc, inoinfo(oldlfdir)->ino_linkcnt + 1); inoinfo(oldlfdir)->ino_linkcnt = 0; dp = ginode(lfdir); } if (inoinfo(lfdir)->ino_state != DFOUND) { pfatal("SORRY. NO lost+found DIRECTORY\n\n"); return (0); } (void)lftempname(tempname, orphan); if (makeentry(lfdir, orphan, (name ? name : tempname)) == 0) { pfatal("SORRY. NO SPACE IN lost+found DIRECTORY"); printf("\n\n"); return (0); } inoinfo(orphan)->ino_linkcnt--; if (lostdir) { if ((changeino(orphan, "..", lfdir) & ALTERED) == 0 && parentdir != (ino_t)-1) (void)makeentry(orphan, lfdir, ".."); dp = ginode(lfdir); DIP_SET(dp, di_nlink, DIP(dp, di_nlink) + 1); inodirty(); inoinfo(lfdir)->ino_linkcnt++; pwarn("DIR I=%lu CONNECTED. ", (u_long)orphan); if (parentdir != (ino_t)-1) { printf("PARENT WAS I=%lu\n", (u_long)parentdir); /* * The parent directory, because of the ordering * guarantees, has had the link count incremented * for the child, but no entry was made. This * fixes the parent link count so that fsck does * not need to be rerun. */ inoinfo(parentdir)->ino_linkcnt++; } if (preen == 0) printf("\n"); } return (1); } /* * fix an entry in a directory. 
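The heart of mkentry() above is a record-splitting trick: an existing record's slack bytes become the new entry. Below is a pared-down sketch of just that step, using a hypothetical two-field header struct in place of the full struct direct; split_entry is an invented name, not an fsck routine.

#include <stdint.h>

/* Only the fields the reclen-splitting trick needs. */
struct shdr {
        uint32_t d_ino;
        uint16_t d_reclen;
};

/*
 * Shrink the old record to the oldlen bytes it really uses (zero if it
 * is a free slot) and hand back the leftover tail as a new record.
 */
static struct shdr *
split_entry(struct shdr *old, uint16_t oldlen, uint16_t need)
{
        struct shdr *tail;

        if (old->d_ino == 0)
                oldlen = 0;             /* free slots are reused whole */
        if (old->d_reclen - oldlen < need)
                return (NULL);          /* no room in this record */
        tail = (struct shdr *)((char *)old + oldlen);
        tail->d_reclen = old->d_reclen - oldlen;
        old->d_reclen = oldlen;
        return (tail);
}

int
main(void)
{
        unsigned char blk[512] = { 0 };
        struct shdr *old = (struct shdr *)blk;

        old->d_ino = 2;
        old->d_reclen = 512;            /* a 12-byte entry plus slack */
        return (split_entry(old, 12, 16) == NULL);
}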
*/ int changeino(ino_t dir, const char *name, ino_t newnum) { struct inodesc idesc; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = DATA; idesc.id_func = chgino; idesc.id_number = dir; idesc.id_fix = DONTKNOW; idesc.id_name = strdup(name); idesc.id_parent = newnum; /* new value for name */ return (ckinode(ginode(dir), &idesc)); } /* * make an entry in a directory */ int makeentry(ino_t parent, ino_t ino, const char *name) { union dinode *dp; struct inodesc idesc; char pathbuf[MAXPATHLEN + 1]; - if (parent < ROOTINO || parent >= maxino || - ino < ROOTINO || ino >= maxino) + if (parent < UFS_ROOTINO || parent >= maxino || + ino < UFS_ROOTINO || ino >= maxino) return (0); memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = DATA; idesc.id_func = mkentry; idesc.id_number = parent; idesc.id_parent = ino; /* this is the inode to enter */ idesc.id_fix = DONTKNOW; idesc.id_name = strdup(name); dp = ginode(parent); if (DIP(dp, di_size) % DIRBLKSIZ) { DIP_SET(dp, di_size, roundup(DIP(dp, di_size), DIRBLKSIZ)); inodirty(); } if ((ckinode(dp, &idesc) & ALTERED) != 0) return (1); getpathname(pathbuf, parent, parent); dp = ginode(parent); if (expanddir(dp, pathbuf) == 0) return (0); return (ckinode(dp, &idesc) & ALTERED); } /* * Attempt to expand the size of a directory */ static int expanddir(union dinode *dp, char *name) { ufs2_daddr_t lastbn, newblk; struct bufarea *bp; char *cp, firstblk[DIRBLKSIZ]; lastbn = lblkno(&sblock, DIP(dp, di_size)); - if (lastbn >= NDADDR - 1 || DIP(dp, di_db[lastbn]) == 0 || + if (lastbn >= UFS_NDADDR - 1 || DIP(dp, di_db[lastbn]) == 0 || DIP(dp, di_size) == 0) return (0); if ((newblk = allocblk(sblock.fs_frag)) == 0) return (0); DIP_SET(dp, di_db[lastbn + 1], DIP(dp, di_db[lastbn])); DIP_SET(dp, di_db[lastbn], newblk); DIP_SET(dp, di_size, DIP(dp, di_size) + sblock.fs_bsize); DIP_SET(dp, di_blocks, DIP(dp, di_blocks) + btodb(sblock.fs_bsize)); bp = getdirblk(DIP(dp, di_db[lastbn + 1]), sblksize(&sblock, DIP(dp, di_size), lastbn + 1)); if (bp->b_errs) goto bad; memmove(firstblk, bp->b_un.b_buf, DIRBLKSIZ); bp = getdirblk(newblk, sblock.fs_bsize); if (bp->b_errs) goto bad; memmove(bp->b_un.b_buf, firstblk, DIRBLKSIZ); for (cp = &bp->b_un.b_buf[DIRBLKSIZ]; cp < &bp->b_un.b_buf[sblock.fs_bsize]; cp += DIRBLKSIZ) memmove(cp, &emptydir, sizeof emptydir); dirty(bp); bp = getdirblk(DIP(dp, di_db[lastbn + 1]), sblksize(&sblock, DIP(dp, di_size), lastbn + 1)); if (bp->b_errs) goto bad; memmove(bp->b_un.b_buf, &emptydir, sizeof emptydir); pwarn("NO SPACE LEFT IN %s", name); if (preen) printf(" (EXPANDED)\n"); else if (reply("EXPAND") == 0) goto bad; dirty(bp); inodirty(); return (1); bad: DIP_SET(dp, di_db[lastbn], DIP(dp, di_db[lastbn + 1])); DIP_SET(dp, di_db[lastbn + 1], 0); DIP_SET(dp, di_size, DIP(dp, di_size) - sblock.fs_bsize); DIP_SET(dp, di_blocks, DIP(dp, di_blocks) - btodb(sblock.fs_bsize)); freeblk(newblk, sblock.fs_frag); return (0); } /* * allocate a new directory */ ino_t allocdir(ino_t parent, ino_t request, int mode) { ino_t ino; char *cp; union dinode *dp; struct bufarea *bp; struct inoinfo *inp; struct dirtemplate *dirp; ino = allocino(request, IFDIR|mode); dirp = &dirhead; dirp->dot_ino = ino; dirp->dotdot_ino = parent; dp = ginode(ino); bp = getdirblk(DIP(dp, di_db[0]), sblock.fs_fsize); if (bp->b_errs) { freeino(ino); return (0); } memmove(bp->b_un.b_buf, dirp, sizeof(struct dirtemplate)); for (cp = &bp->b_un.b_buf[DIRBLKSIZ]; cp < &bp->b_un.b_buf[sblock.fs_fsize]; cp += DIRBLKSIZ) memmove(cp, &emptydir, sizeof emptydir); dirty(bp); 
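/*
 * At this point the new directory's first block holds the "." and ".."
 * template followed by empty chunks; what remains is to set its link
 * count and, except for the root, account for it in the parent.
 */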
DIP_SET(dp, di_nlink, 2); inodirty(); - if (ino == ROOTINO) { + if (ino == UFS_ROOTINO) { inoinfo(ino)->ino_linkcnt = DIP(dp, di_nlink); cacheino(dp, ino); return(ino); } if (!INO_IS_DVALID(parent)) { freeino(ino); return (0); } cacheino(dp, ino); inp = getinoinfo(ino); inp->i_parent = parent; inp->i_dotdot = parent; inoinfo(ino)->ino_state = inoinfo(parent)->ino_state; if (inoinfo(ino)->ino_state == DSTATE) { inoinfo(ino)->ino_linkcnt = DIP(dp, di_nlink); inoinfo(parent)->ino_linkcnt++; } dp = ginode(parent); DIP_SET(dp, di_nlink, DIP(dp, di_nlink) + 1); inodirty(); return (ino); } /* * free a directory inode */ static void freedir(ino_t ino, ino_t parent) { union dinode *dp; if (ino != parent) { dp = ginode(parent); DIP_SET(dp, di_nlink, DIP(dp, di_nlink) - 1); inodirty(); } freeino(ino); } /* * generate a temporary name for the lost+found directory. */ static int lftempname(char *bufp, ino_t ino) { ino_t in; char *cp; int namlen; cp = bufp + 2; for (in = maxino; in > 0; in /= 10) cp++; *--cp = 0; namlen = cp - bufp; in = ino; while (cp > bufp) { *--cp = (in % 10) + '0'; in /= 10; } *cp = '#'; return (namlen); } /* * Get a directory block. * Insure that it is held until another is requested. */ static struct bufarea * getdirblk(ufs2_daddr_t blkno, long size) { if (pdirbp != NULL) pdirbp->b_flags &= ~B_INUSE; pdirbp = getdatablk(blkno, size, BT_DIRDATA); return (pdirbp); } Index: head/sbin/fsck_ffs/fsutil.c =================================================================== --- head/sbin/fsck_ffs/fsutil.c (revision 313779) +++ head/sbin/fsck_ffs/fsutil.c (revision 313780) @@ -1,1044 +1,1044 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #if 0 #ifndef lint static const char sccsid[] = "@(#)utilities.c 8.6 (Berkeley) 5/19/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsck.h" static void slowio_start(void); static void slowio_end(void); static void printIOstats(void); static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */ static struct timespec startpass, finishpass; struct timeval slowio_starttime; int slowio_delay_usec = 10000; /* Initial IO delay for background fsck */ int slowio_pollcnt; static struct bufarea cgblk; /* backup buffer for cylinder group blocks */ static TAILQ_HEAD(buflist, bufarea) bufhead; /* head of buffer cache list */ static int numbufs; /* size of buffer cache */ static char *buftype[BT_NUMBUFTYPES] = BT_NAMES; static struct bufarea *cgbufs; /* header for cylinder group cache */ static int flushtries; /* number of tries to reclaim memory */ void fsutilinit(void) { diskreads = totaldiskreads = totalreads = 0; bzero(&startpass, sizeof(struct timespec)); bzero(&finishpass, sizeof(struct timespec)); bzero(&slowio_starttime, sizeof(struct timeval)); slowio_delay_usec = 10000; slowio_pollcnt = 0; bzero(&cgblk, sizeof(struct bufarea)); TAILQ_INIT(&bufhead); numbufs = 0; /* buftype ? */ cgbufs = NULL; flushtries = 0; } int ftypeok(union dinode *dp) { switch (DIP(dp, di_mode) & IFMT) { case IFDIR: case IFREG: case IFBLK: case IFCHR: case IFLNK: case IFSOCK: case IFIFO: return (1); default: if (debug) printf("bad file type 0%o\n", DIP(dp, di_mode)); return (0); } } int reply(const char *question) { int persevere; char c; if (preen) pfatal("INTERNAL ERROR: GOT TO reply()"); persevere = !strcmp(question, "CONTINUE"); printf("\n"); if (!persevere && (nflag || (fswritefd < 0 && bkgrdflag == 0))) { printf("%s? no\n\n", question); resolved = 0; return (0); } if (yflag || (persevere && nflag)) { printf("%s? yes\n\n", question); return (1); } do { printf("%s? [yn] ", question); (void) fflush(stdout); c = getc(stdin); while (c != '\n' && getc(stdin) != '\n') { if (feof(stdin)) { resolved = 0; return (0); } } } while (c != 'y' && c != 'Y' && c != 'n' && c != 'N'); printf("\n"); if (c == 'y' || c == 'Y') return (1); resolved = 0; return (0); } /* * Look up state information for an inode. */ struct inostat * inoinfo(ino_t inum) { static struct inostat unallocated = { USTATE, 0, 0 }; struct inostatlist *ilp; int iloff; if (inum > maxino) errx(EEXIT, "inoinfo: inumber %ju out of range", (uintmax_t)inum); ilp = &inostathead[inum / sblock.fs_ipg]; iloff = inum % sblock.fs_ipg; if (iloff >= ilp->il_numalloced) return (&unallocated); return (&ilp->il_stat[iloff]); } /* * Malloc buffers and set up cache. 
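inoinfo() above resolves an inode number with a two-level lookup: inum / fs_ipg selects the per-cylinder-group state array and inum % fs_ipg indexes into it, with a shared "unallocated" default for offsets beyond il_numalloced. A stripped-down sketch follows; struct group, inostate, and the ipg value are inventions for illustration only.

#include <stdint.h>
#include <stdlib.h>

struct group {
        long     nalloc;        /* states allocated for this group */
        uint8_t *state;
};

static struct group *groups;    /* one slot per cylinder group */
static long ipg = 8192;         /* inodes per group: assumed value */

static uint8_t
inostate(long inum)
{
        struct group *g = &groups[inum / ipg];

        if (inum % ipg >= g->nalloc)
                return (0);     /* never allocated: default, cf. USTATE */
        return (g->state[inum % ipg]);
}

int
main(void)
{
        if ((groups = calloc(2, sizeof(*groups))) == NULL)
                return (1);
        return (inostate(11));  /* nothing allocated yet, so 0 */
}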
*/ void bufinit(void) { struct bufarea *bp; long bufcnt, i; char *bufp; pbp = pdirbp = (struct bufarea *)0; bufp = Malloc((unsigned int)sblock.fs_bsize); if (bufp == NULL) errx(EEXIT, "cannot allocate buffer pool"); cgblk.b_un.b_buf = bufp; initbarea(&cgblk, BT_CYLGRP); TAILQ_INIT(&bufhead); bufcnt = MAXBUFS; if (bufcnt < MINBUFS) bufcnt = MINBUFS; for (i = 0; i < bufcnt; i++) { bp = (struct bufarea *)Malloc(sizeof(struct bufarea)); bufp = Malloc((unsigned int)sblock.fs_bsize); if (bp == NULL || bufp == NULL) { if (i >= MINBUFS) break; errx(EEXIT, "cannot allocate buffer pool"); } bp->b_un.b_buf = bufp; TAILQ_INSERT_HEAD(&bufhead, bp, b_list); initbarea(bp, BT_UNKNOWN); } numbufs = i; /* save number of buffers */ for (i = 0; i < BT_NUMBUFTYPES; i++) { readtime[i].tv_sec = totalreadtime[i].tv_sec = 0; readtime[i].tv_nsec = totalreadtime[i].tv_nsec = 0; readcnt[i] = totalreadcnt[i] = 0; } } /* * Manage cylinder group buffers. */ static struct bufarea *cgbufs; /* header for cylinder group cache */ static int flushtries; /* number of tries to reclaim memory */ struct bufarea * cgget(int cg) { struct bufarea *cgbp; struct cg *cgp; if (cgbufs == NULL) { cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea)); if (cgbufs == NULL) errx(EEXIT, "cannot allocate cylinder group buffers"); } cgbp = &cgbufs[cg]; if (cgbp->b_un.b_cg != NULL) return (cgbp); cgp = NULL; if (flushtries == 0) cgp = malloc((unsigned int)sblock.fs_cgsize); if (cgp == NULL) { getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize); return (&cgblk); } cgbp->b_un.b_cg = cgp; initbarea(cgbp, BT_CYLGRP); getblk(cgbp, cgtod(&sblock, cg), sblock.fs_cgsize); return (cgbp); } /* * Attempt to flush a cylinder group cache entry. * Return whether the flush was successful. */ int flushentry(void) { struct bufarea *cgbp; if (flushtries == sblock.fs_ncg || cgbufs == NULL) return (0); cgbp = &cgbufs[flushtries++]; if (cgbp->b_un.b_cg == NULL) return (0); flush(fswritefd, cgbp); free(cgbp->b_un.b_buf); cgbp->b_un.b_buf = NULL; return (1); } /* * Manage a cache of directory blocks. */ struct bufarea * getdatablk(ufs2_daddr_t blkno, long size, int type) { struct bufarea *bp; TAILQ_FOREACH(bp, &bufhead, b_list) if (bp->b_bno == fsbtodb(&sblock, blkno)) goto foundit; TAILQ_FOREACH_REVERSE(bp, &bufhead, buflist, b_list) if ((bp->b_flags & B_INUSE) == 0) break; if (bp == NULL) errx(EEXIT, "deadlocked buffer pool"); bp->b_type = type; getblk(bp, blkno, size); /* fall through */ foundit: if (debug && bp->b_type != type) printf("Buffer type changed from %s to %s\n", buftype[bp->b_type], buftype[type]); TAILQ_REMOVE(&bufhead, bp, b_list); TAILQ_INSERT_HEAD(&bufhead, bp, b_list); bp->b_flags |= B_INUSE; return (bp); } /* * Timespec operations (from ). 
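getdatablk() above is a small LRU cache: a hit is moved to the front of the TAILQ, and a miss recycles the last (least recently used) buffer that is not pinned with B_INUSE. A minimal version using the same <sys/queue.h> macros the function itself relies on; struct buf and lru_get here are invented names for the sketch.

#include <sys/queue.h>

struct buf {
        long                     bno;   /* cached block number, -1 if none */
        int                      busy;
        TAILQ_ENTRY(buf)         list;
};
static TAILQ_HEAD(bufhead, buf) head = TAILQ_HEAD_INITIALIZER(head);

static struct buf *
lru_get(long bno)
{
        struct buf *bp;

        TAILQ_FOREACH(bp, &head, list)          /* cache hit? */
                if (bp->bno == bno)
                        goto found;
        TAILQ_FOREACH_REVERSE(bp, &head, bufhead, list)
                if (!bp->busy)                  /* oldest unpinned buffer */
                        break;
        if (bp == NULL)
                return (NULL);          /* cf. "deadlocked buffer pool" */
        bp->bno = bno;                  /* caller would now read the block */
found:
        TAILQ_REMOVE(&head, bp, list);
        TAILQ_INSERT_HEAD(&head, bp, list);     /* most recently used first */
        return (bp);
}

int
main(void)
{
        static struct buf pool[4];
        int i;

        for (i = 0; i < 4; i++) {
                pool[i].bno = -1;
                TAILQ_INSERT_HEAD(&head, &pool[i], list);
        }
        return (lru_get(7) == NULL);    /* miss recycles a buffer */
}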
*/ #define timespecsub(vvp, uvp) \ do { \ (vvp)->tv_sec -= (uvp)->tv_sec; \ (vvp)->tv_nsec -= (uvp)->tv_nsec; \ if ((vvp)->tv_nsec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_nsec += 1000000000; \ } \ } while (0) #define timespecadd(vvp, uvp) \ do { \ (vvp)->tv_sec += (uvp)->tv_sec; \ (vvp)->tv_nsec += (uvp)->tv_nsec; \ if ((vvp)->tv_nsec >= 1000000000) { \ (vvp)->tv_sec++; \ (vvp)->tv_nsec -= 1000000000; \ } \ } while (0) void getblk(struct bufarea *bp, ufs2_daddr_t blk, long size) { ufs2_daddr_t dblk; struct timespec start, finish; dblk = fsbtodb(&sblock, blk); if (bp->b_bno == dblk) { totalreads++; } else { flush(fswritefd, bp); if (debug) { readcnt[bp->b_type]++; clock_gettime(CLOCK_REALTIME_PRECISE, &start); } bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, dblk, size); if (debug) { clock_gettime(CLOCK_REALTIME_PRECISE, &finish); timespecsub(&finish, &start); timespecadd(&readtime[bp->b_type], &finish); } bp->b_bno = dblk; bp->b_size = size; } } void flush(int fd, struct bufarea *bp) { int i, j; if (!bp->b_dirty) return; bp->b_dirty = 0; if (fswritefd < 0) { pfatal("WRITING IN READ_ONLY MODE.\n"); return; } if (bp->b_errs != 0) pfatal("WRITING %sZERO'ED BLOCK %lld TO DISK\n", (bp->b_errs == bp->b_size / dev_bsize) ? "" : "PARTIALLY ", (long long)bp->b_bno); bp->b_errs = 0; blwrite(fd, bp->b_un.b_buf, bp->b_bno, bp->b_size); if (bp != &sblk) return; for (i = 0, j = 0; i < sblock.fs_cssize; i += sblock.fs_bsize, j++) { blwrite(fswritefd, (char *)sblock.fs_csp + i, fsbtodb(&sblock, sblock.fs_csaddr + j * sblock.fs_frag), MIN(sblock.fs_cssize - i, sblock.fs_bsize)); } } void rwerror(const char *mesg, ufs2_daddr_t blk) { if (bkgrdcheck) exit(EEXIT); if (preen == 0) printf("\n"); pfatal("CANNOT %s: %ld", mesg, (long)blk); if (reply("CONTINUE") == 0) exit(EEXIT); } void ckfini(int markclean) { struct bufarea *bp, *nbp; int ofsmodified, cnt; if (bkgrdflag) { unlink(snapname); if ((!(sblock.fs_flags & FS_UNCLEAN)) != markclean) { cmd.value = FS_UNCLEAN; cmd.size = markclean ? -1 : 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) rwerror("SET FILE SYSTEM FLAGS", FS_UNCLEAN); if (!preen) { printf("\n***** FILE SYSTEM MARKED %s *****\n", markclean ? 
"CLEAN" : "DIRTY"); if (!markclean) rerun = 1; } } else if (!preen && !markclean) { printf("\n***** FILE SYSTEM STILL DIRTY *****\n"); rerun = 1; } } if (debug && totalreads > 0) printf("cache with %d buffers missed %ld of %ld (%d%%)\n", numbufs, totaldiskreads, totalreads, (int)(totaldiskreads * 100 / totalreads)); if (fswritefd < 0) { (void)close(fsreadfd); return; } flush(fswritefd, &sblk); if (havesb && cursnapshot == 0 && sblock.fs_magic == FS_UFS2_MAGIC && sblk.b_bno != sblock.fs_sblockloc / dev_bsize && !preen && reply("UPDATE STANDARD SUPERBLOCK")) { sblk.b_bno = sblock.fs_sblockloc / dev_bsize; sbdirty(); flush(fswritefd, &sblk); } flush(fswritefd, &cgblk); free(cgblk.b_un.b_buf); cnt = 0; TAILQ_FOREACH_REVERSE_SAFE(bp, &bufhead, buflist, b_list, nbp) { TAILQ_REMOVE(&bufhead, bp, b_list); cnt++; flush(fswritefd, bp); free(bp->b_un.b_buf); free((char *)bp); } if (numbufs != cnt) errx(EEXIT, "panic: lost %d buffers", numbufs - cnt); if (cgbufs != NULL) { for (cnt = 0; cnt < sblock.fs_ncg; cnt++) { if (cgbufs[cnt].b_un.b_cg == NULL) continue; flush(fswritefd, &cgbufs[cnt]); free(cgbufs[cnt].b_un.b_cg); } free(cgbufs); } pbp = pdirbp = (struct bufarea *)0; if (cursnapshot == 0 && sblock.fs_clean != markclean) { if ((sblock.fs_clean = markclean) != 0) { sblock.fs_flags &= ~(FS_UNCLEAN | FS_NEEDSFSCK); sblock.fs_pendingblocks = 0; sblock.fs_pendinginodes = 0; } sbdirty(); ofsmodified = fsmodified; flush(fswritefd, &sblk); fsmodified = ofsmodified; if (!preen) { printf("\n***** FILE SYSTEM MARKED %s *****\n", markclean ? "CLEAN" : "DIRTY"); if (!markclean) rerun = 1; } } else if (!preen) { if (markclean) { printf("\n***** FILE SYSTEM IS CLEAN *****\n"); } else { printf("\n***** FILE SYSTEM STILL DIRTY *****\n"); rerun = 1; } } (void)close(fsreadfd); (void)close(fswritefd); } /* * Print out I/O statistics. 
*/ void IOstats(char *what) { int i; if (debug == 0) return; if (diskreads == 0) { printf("%s: no I/O\n\n", what); return; } if (startpass.tv_sec == 0) startpass = startprog; printf("%s: I/O statistics\n", what); printIOstats(); totaldiskreads += diskreads; diskreads = 0; for (i = 0; i < BT_NUMBUFTYPES; i++) { timespecadd(&totalreadtime[i], &readtime[i]); totalreadcnt[i] += readcnt[i]; readtime[i].tv_sec = readtime[i].tv_nsec = 0; readcnt[i] = 0; } clock_gettime(CLOCK_REALTIME_PRECISE, &startpass); } void finalIOstats(void) { int i; if (debug == 0) return; printf("Final I/O statistics\n"); totaldiskreads += diskreads; diskreads = totaldiskreads; startpass = startprog; for (i = 0; i < BT_NUMBUFTYPES; i++) { timespecadd(&totalreadtime[i], &readtime[i]); totalreadcnt[i] += readcnt[i]; readtime[i] = totalreadtime[i]; readcnt[i] = totalreadcnt[i]; } printIOstats(); } static void printIOstats(void) { long long msec, totalmsec; int i; clock_gettime(CLOCK_REALTIME_PRECISE, &finishpass); timespecsub(&finishpass, &startpass); printf("Running time: %jd.%03ld sec\n", (intmax_t)finishpass.tv_sec, finishpass.tv_nsec / 1000000); printf("buffer reads by type:\n"); for (totalmsec = 0, i = 0; i < BT_NUMBUFTYPES; i++) totalmsec += readtime[i].tv_sec * 1000 + readtime[i].tv_nsec / 1000000; if (totalmsec == 0) totalmsec = 1; for (i = 0; i < BT_NUMBUFTYPES; i++) { if (readcnt[i] == 0) continue; msec = readtime[i].tv_sec * 1000 + readtime[i].tv_nsec / 1000000; printf("%21s:%8ld %2ld.%ld%% %4jd.%03ld sec %2lld.%lld%%\n", buftype[i], readcnt[i], readcnt[i] * 100 / diskreads, (readcnt[i] * 1000 / diskreads) % 10, (intmax_t)readtime[i].tv_sec, readtime[i].tv_nsec / 1000000, msec * 100 / totalmsec, (msec * 1000 / totalmsec) % 10); } printf("\n"); } int blread(int fd, char *buf, ufs2_daddr_t blk, long size) { char *cp; int i, errs; off_t offset; offset = blk; offset *= dev_bsize; if (bkgrdflag) slowio_start(); totalreads++; diskreads++; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); else if (read(fd, buf, (int)size) == size) { if (bkgrdflag) slowio_end(); return (0); } /* * This is handled specially here instead of in rwerror because * rwerror is used for all sorts of errors, not just true read/write * errors. It should be refactored and fixed. 
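 * If the caller asked fsck to surrender on unreadable blocks, we abort
 * outright; otherwise the code below falls back to reading sector by
 * sector, zero-filling and reporting each sector that cannot be read.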
*/ if (surrender) { pfatal("CANNOT READ_BLK: %ld", (long)blk); errx(EEXIT, "ABORTING DUE TO READ ERRORS"); } else rwerror("READ BLK", blk); if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); errs = 0; memset(buf, 0, (size_t)size); printf("THE FOLLOWING DISK SECTORS COULD NOT BE READ:"); for (cp = buf, i = 0; i < size; i += secsize, cp += secsize) { if (read(fd, cp, (int)secsize) != secsize) { (void)lseek(fd, offset + i + secsize, 0); if (secsize != dev_bsize && dev_bsize != 1) printf(" %jd (%jd),", (intmax_t)(blk * dev_bsize + i) / secsize, (intmax_t)blk + i / dev_bsize); else printf(" %jd,", (intmax_t)blk + i / dev_bsize); errs++; } } printf("\n"); if (errs) resolved = 0; return (errs); } void blwrite(int fd, char *buf, ufs2_daddr_t blk, ssize_t size) { int i; char *cp; off_t offset; if (fd < 0) return; offset = blk; offset *= dev_bsize; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); else if (write(fd, buf, size) == size) { fsmodified = 1; return; } resolved = 0; rwerror("WRITE BLK", blk); if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); printf("THE FOLLOWING SECTORS COULD NOT BE WRITTEN:"); for (cp = buf, i = 0; i < size; i += dev_bsize, cp += dev_bsize) if (write(fd, cp, dev_bsize) != dev_bsize) { (void)lseek(fd, offset + i + dev_bsize, 0); printf(" %jd,", (intmax_t)blk + i / dev_bsize); } printf("\n"); return; } void blerase(int fd, ufs2_daddr_t blk, long size) { off_t ioarg[2]; if (fd < 0) return; ioarg[0] = blk * dev_bsize; ioarg[1] = size; ioctl(fd, DIOCGDELETE, ioarg); /* we don't really care if we succeed or not */ return; } /* * Fill a contiguous region with all-zeroes. Note ZEROBUFSIZE is by * definition a multiple of dev_bsize. */ void blzero(int fd, ufs2_daddr_t blk, long size) { static char *zero; off_t offset, len; if (fd < 0) return; if (zero == NULL) { zero = calloc(ZEROBUFSIZE, 1); if (zero == NULL) errx(EEXIT, "cannot allocate buffer pool"); } offset = blk * dev_bsize; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); while (size > 0) { len = MIN(ZEROBUFSIZE, size); if (write(fd, zero, len) != len) rwerror("WRITE BLK", blk); blk += len / dev_bsize; size -= len; } } /* * Verify cylinder group's magic number and other parameters. If the * test fails, offer an option to rebuild the whole cylinder group. */ int check_cgmagic(int cg, struct bufarea *cgbp) { struct cg *cgp = cgbp->b_un.b_cg; /* * Extended cylinder group checks. */ if (cg_chkmagic(cgp) && ((sblock.fs_magic == FS_UFS1_MAGIC && cgp->cg_old_niblk == sblock.fs_ipg && cgp->cg_ndblk <= sblock.fs_fpg && cgp->cg_old_ncyl <= sblock.fs_old_cpg) || (sblock.fs_magic == FS_UFS2_MAGIC && cgp->cg_niblk == sblock.fs_ipg && cgp->cg_ndblk <= sblock.fs_fpg && cgp->cg_initediblk <= sblock.fs_ipg))) { return (1); } pfatal("CYLINDER GROUP %d: BAD MAGIC NUMBER", cg); if (!reply("REBUILD CYLINDER GROUP")) { printf("YOU WILL NEED TO RERUN FSCK.\n"); rerun = 1; return (1); } /* * Zero out the cylinder group and then initialize critical fields. * Bit maps and summaries will be recalculated by later passes. 
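 * (In the UFS1 branch below, the old cylinder-group bookkeeping areas
 * are laid out first, which is why cg_iusedoff is recomputed there.)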
*/ memset(cgp, 0, (size_t)sblock.fs_cgsize); cgp->cg_magic = CG_MAGIC; cgp->cg_cgx = cg; cgp->cg_niblk = sblock.fs_ipg; cgp->cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock)); if (cgbase(&sblock, cg) + sblock.fs_fpg < sblock.fs_size) cgp->cg_ndblk = sblock.fs_fpg; else cgp->cg_ndblk = sblock.fs_size - cgbase(&sblock, cg); cgp->cg_iusedoff = &cgp->cg_space[0] - (u_char *)(&cgp->cg_firstfield); if (sblock.fs_magic == FS_UFS1_MAGIC) { cgp->cg_niblk = 0; cgp->cg_initediblk = 0; cgp->cg_old_ncyl = sblock.fs_old_cpg; cgp->cg_old_niblk = sblock.fs_ipg; cgp->cg_old_btotoff = cgp->cg_iusedoff; cgp->cg_old_boff = cgp->cg_old_btotoff + sblock.fs_old_cpg * sizeof(int32_t); cgp->cg_iusedoff = cgp->cg_old_boff + sblock.fs_old_cpg * sizeof(u_int16_t); } cgp->cg_freeoff = cgp->cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT); cgp->cg_nextfreeoff = cgp->cg_freeoff + howmany(sblock.fs_fpg,CHAR_BIT); if (sblock.fs_contigsumsize > 0) { cgp->cg_nclusterblks = cgp->cg_ndblk / sblock.fs_frag; cgp->cg_clustersumoff = roundup(cgp->cg_nextfreeoff, sizeof(u_int32_t)); cgp->cg_clustersumoff -= sizeof(u_int32_t); cgp->cg_clusteroff = cgp->cg_clustersumoff + (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t); cgp->cg_nextfreeoff = cgp->cg_clusteroff + howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT); } dirty(cgbp); return (0); } /* * allocate a data block with the specified number of fragments */ ufs2_daddr_t allocblk(long frags) { int i, j, k, cg, baseblk; struct bufarea *cgbp; struct cg *cgp; if (frags <= 0 || frags > sblock.fs_frag) return (0); for (i = 0; i < maxfsblock - sblock.fs_frag; i += sblock.fs_frag) { for (j = 0; j <= sblock.fs_frag - frags; j++) { if (testbmap(i + j)) continue; for (k = 1; k < frags; k++) if (testbmap(i + j + k)) break; if (k < frags) { j += k; continue; } cg = dtog(&sblock, i + j); cgbp = cgget(cg); cgp = cgbp->b_un.b_cg; if (!check_cgmagic(cg, cgbp)) return (0); baseblk = dtogd(&sblock, i + j); for (k = 0; k < frags; k++) { setbmap(i + j + k); clrbit(cg_blksfree(cgp), baseblk + k); } n_blks += frags; if (frags == sblock.fs_frag) cgp->cg_cs.cs_nbfree--; else cgp->cg_cs.cs_nffree -= frags; dirty(cgbp); return (i + j); } } return (0); } /* * Free a previously allocated block */ void freeblk(ufs2_daddr_t blkno, long frags) { struct inodesc idesc; idesc.id_blkno = blkno; idesc.id_numfrags = frags; (void)pass4check(&idesc); } /* Slow down IO so as to leave some disk bandwidth for other processes */ void slowio_start() { /* Delay one in every 8 operations */ slowio_pollcnt = (slowio_pollcnt + 1) & 7; if (slowio_pollcnt == 0) { gettimeofday(&slowio_starttime, NULL); } } void slowio_end() { struct timeval tv; int delay_usec; if (slowio_pollcnt != 0) return; /* Update the slowdown interval. 
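 * Each sample is clamped to [64, 2500000] microseconds and folded into
 * a 1/64-weight exponential moving average; every eighth operation
 * then sleeps for eight times that average.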
*/ gettimeofday(&tv, NULL); delay_usec = (tv.tv_sec - slowio_starttime.tv_sec) * 1000000 + (tv.tv_usec - slowio_starttime.tv_usec); if (delay_usec < 64) delay_usec = 64; if (delay_usec > 2500000) delay_usec = 2500000; slowio_delay_usec = (slowio_delay_usec * 63 + delay_usec) >> 6; /* delay by 8 times the average IO delay */ if (slowio_delay_usec > 64) usleep(slowio_delay_usec * 8); } /* * Find a pathname */ void getpathname(char *namebuf, ino_t curdir, ino_t ino) { int len; char *cp; struct inodesc idesc; static int busy = 0; - if (curdir == ino && ino == ROOTINO) { + if (curdir == ino && ino == UFS_ROOTINO) { (void)strcpy(namebuf, "/"); return; } if (busy || !INO_IS_DVALID(curdir)) { (void)strcpy(namebuf, "?"); return; } busy = 1; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = DATA; idesc.id_fix = IGNORE; cp = &namebuf[MAXPATHLEN - 1]; *cp = '\0'; if (curdir != ino) { idesc.id_parent = curdir; goto namelookup; } - while (ino != ROOTINO) { + while (ino != UFS_ROOTINO) { idesc.id_number = ino; idesc.id_func = findino; idesc.id_name = strdup(".."); if ((ckinode(ginode(ino), &idesc) & FOUND) == 0) break; namelookup: idesc.id_number = idesc.id_parent; idesc.id_parent = ino; idesc.id_func = findname; idesc.id_name = namebuf; if ((ckinode(ginode(idesc.id_number), &idesc)&FOUND) == 0) break; len = strlen(namebuf); cp -= len; memmove(cp, namebuf, (size_t)len); *--cp = '/'; if (cp < &namebuf[UFS_MAXNAMLEN]) break; ino = idesc.id_number; } busy = 0; - if (ino != ROOTINO) + if (ino != UFS_ROOTINO) *--cp = '?'; memmove(namebuf, cp, (size_t)(&namebuf[MAXPATHLEN] - cp)); } void catch(int sig __unused) { ckfini(0); exit(12); } /* * When preening, allow a single quit to signal * a special exit after file system checks complete * so that reboot sequence may be interrupted. */ void catchquit(int sig __unused) { printf("returning to single-user after file system check\n"); returntosingle = 1; (void)signal(SIGQUIT, SIG_DFL); } /* * determine whether an inode should be fixed. */ int dofix(struct inodesc *idesc, const char *msg) { switch (idesc->id_fix) { case DONTKNOW: if (idesc->id_type == DATA) direrror(idesc->id_number, msg); else pwarn("%s", msg); if (preen) { printf(" (SALVAGED)\n"); idesc->id_fix = FIX; return (ALTERED); } if (reply("SALVAGE") == 0) { idesc->id_fix = NOFIX; return (0); } idesc->id_fix = FIX; return (ALTERED); case FIX: return (ALTERED); case NOFIX: case IGNORE: return (0); default: errx(EEXIT, "UNKNOWN INODESC FIX MODE %d", idesc->id_fix); } /* NOTREACHED */ return (0); } #include /* * An unexpected inconsistency occurred. * Die if preening or file system is running with soft dependency protocol, * otherwise just print message and continue. */ void pfatal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (!preen) { (void)vfprintf(stdout, fmt, ap); va_end(ap); if (usedsoftdep) (void)fprintf(stdout, "\nUNEXPECTED SOFT UPDATE INCONSISTENCY\n"); /* * Force foreground fsck to clean up inconsistency. */ if (bkgrdflag) { cmd.value = FS_NEEDSFSCK; cmd.size = 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n"); fprintf(stdout, "CANNOT RUN IN BACKGROUND\n"); ckfini(0); exit(EEXIT); } return; } if (cdevname == NULL) cdevname = strdup("fsck"); (void)fprintf(stdout, "%s: ", cdevname); (void)vfprintf(stdout, fmt, ap); (void)fprintf(stdout, "\n%s: UNEXPECTED%sINCONSISTENCY; RUN fsck MANUALLY.\n", cdevname, usedsoftdep ? " SOFT UPDATE " : " "); /* * Force foreground fsck to clean up inconsistency. 
*/ if (bkgrdflag) { cmd.value = FS_NEEDSFSCK; cmd.size = 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n"); } ckfini(0); exit(EEXIT); } /* * Pwarn just prints a message when not preening or running soft dependency * protocol, or a warning (preceded by filename) when preening. */ void pwarn(const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (preen) (void)fprintf(stdout, "%s: ", cdevname); (void)vfprintf(stdout, fmt, ap); va_end(ap); } /* * Stub for routines from kernel. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); pfatal("INTERNAL INCONSISTENCY:"); (void)vfprintf(stdout, fmt, ap); va_end(ap); exit(EEXIT); } Index: head/sbin/fsck_ffs/gjournal.c =================================================================== --- head/sbin/fsck_ffs/gjournal.c (revision 313779) +++ head/sbin/fsck_ffs/gjournal.c (revision 313780) @@ -1,510 +1,510 @@ /*- * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsck.h" struct cgchain { union { struct cg cgcu_cg; char cgcu_buf[MAXBSIZE]; } cgc_union; int cgc_busy; int cgc_dirty; LIST_ENTRY(cgchain) cgc_next; }; #define cgc_cg cgc_union.cgcu_cg #define MAX_CACHED_CGS 1024 static unsigned ncgs = 0; static LIST_HEAD(, cgchain) cglist = LIST_HEAD_INITIALIZER(cglist); static const char *devnam; static struct uufsd *disk = NULL; static struct fs *fs = NULL; struct ufs2_dinode ufs2_zino; static void putcgs(void); /* * Return cylinder group from the cache or load it if it is not in the * cache yet. * Don't cache more than MAX_CACHED_CGS cylinder groups. */ static struct cgchain * getcg(int cg) { struct cgchain *cgc; assert(disk != NULL && fs != NULL); LIST_FOREACH(cgc, &cglist, cgc_next) { if (cgc->cgc_cg.cg_cgx == cg) { //printf("%s: Found cg=%d\n", __func__, cg); return (cgc); } } /* * Our cache is full? Let's clean it up. */ if (ncgs >= MAX_CACHED_CGS) { //printf("%s: Flushing CGs.\n", __func__); putcgs(); } cgc = malloc(sizeof(*cgc)); if (cgc == NULL) { /* * Cannot allocate memory? * Let's put all currently loaded and not busy cylinder groups * on disk and try again. */ //printf("%s: No memory, flushing CGs.\n", __func__); putcgs(); cgc = malloc(sizeof(*cgc)); if (cgc == NULL) err(1, "malloc(%zu)", sizeof(*cgc)); } if (cgread1(disk, cg) == -1) err(1, "cgread1(%d)", cg); bcopy(&disk->d_cg, &cgc->cgc_cg, sizeof(cgc->cgc_union)); cgc->cgc_busy = 0; cgc->cgc_dirty = 0; LIST_INSERT_HEAD(&cglist, cgc, cgc_next); ncgs++; //printf("%s: Read cg=%d\n", __func__, cg); return (cgc); } /* * Mark cylinder group as dirty - it will be written back on putcgs(). */ static void dirtycg(struct cgchain *cgc) { cgc->cgc_dirty = 1; } /* * Mark cylinder group as busy - it will not be freed on putcgs(). */ static void busycg(struct cgchain *cgc) { cgc->cgc_busy = 1; } /* * Unmark the given cylinder group as busy. */ static void unbusycg(struct cgchain *cgc) { cgc->cgc_busy = 0; } /* * Write back all dirty cylinder groups. * Free all non-busy cylinder groups. */ static void putcgs(void) { struct cgchain *cgc, *cgc2; assert(disk != NULL && fs != NULL); LIST_FOREACH_SAFE(cgc, &cglist, cgc_next, cgc2) { if (cgc->cgc_busy) continue; LIST_REMOVE(cgc, cgc_next); ncgs--; if (cgc->cgc_dirty) { bcopy(&cgc->cgc_cg, &disk->d_cg, sizeof(cgc->cgc_union)); if (cgwrite1(disk, cgc->cgc_cg.cg_cgx) == -1) err(1, "cgwrite1(%d)", cgc->cgc_cg.cg_cgx); //printf("%s: Wrote cg=%d\n", __func__, // cgc->cgc_cg.cg_cgx); } free(cgc); } } #if 0 /* * Free all non-busy cylinder groups without storing the dirty ones. 
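getcg() and putcgs() above implement a write-back cache: entries accumulate on a list, and when the cache is flushed the dirty ones are written out and every non-busy one is freed. A skeletal version using the same FreeBSD <sys/queue.h> LIST macros; centry, writeback, and flush_cache are invented names standing in for cgchain, cgwrite1(), and putcgs().

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct centry {
        int                      id;
        int                      busy, dirty;
        LIST_ENTRY(centry)       next;
};
static LIST_HEAD(, centry) cache = LIST_HEAD_INITIALIZER(cache);

static void
writeback(struct centry *c)
{
        printf("writing entry %d\n", c->id);    /* cf. cgwrite1() */
}

/* Flush dirty entries and free everything that is not busy. */
static void
flush_cache(void)
{
        struct centry *c, *c2;

        LIST_FOREACH_SAFE(c, &cache, next, c2) {
                if (c->busy)
                        continue;
                LIST_REMOVE(c, next);
                if (c->dirty)
                        writeback(c);
                free(c);
        }
}

int
main(void)
{
        struct centry *c;

        if ((c = calloc(1, sizeof(*c))) == NULL)
                return (1);
        c->dirty = 1;
        LIST_INSERT_HEAD(&cache, c, next);
        flush_cache();                  /* prints "writing entry 0" */
        return (0);
}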
*/ static void cancelcgs(void) { struct cgchain *cgc; assert(disk != NULL && fs != NULL); while ((cgc = LIST_FIRST(&cglist)) != NULL) { if (cgc->cgc_busy) continue; LIST_REMOVE(cgc, cgc_next); //printf("%s: Canceled cg=%d\n", __func__, cgc->cgc_cg.cg_cgx); free(cgc); } } #endif /* * Open the given provider, load superblock. */ static void opendisk(void) { if (disk != NULL) return; disk = malloc(sizeof(*disk)); if (disk == NULL) err(1, "malloc(%zu)", sizeof(*disk)); if (ufs_disk_fillout(disk, devnam) == -1) { err(1, "ufs_disk_fillout(%s) failed: %s", devnam, disk->d_error); } fs = &disk->d_fs; } /* * Mark file system as clean, write the super-block back, close the disk. */ static void closedisk(void) { fs->fs_clean = 1; if (sbwrite(disk, 0) == -1) err(1, "sbwrite(%s)", devnam); if (ufs_disk_close(disk) == -1) err(1, "ufs_disk_close(%s)", devnam); free(disk); disk = NULL; fs = NULL; } static void blkfree(ufs2_daddr_t bno, long size) { struct cgchain *cgc; struct cg *cgp; ufs1_daddr_t fragno, cgbno; int i, cg, blk, frags, bbase; u_int8_t *blksfree; cg = dtog(fs, bno); cgc = getcg(cg); dirtycg(cgc); cgp = &cgc->cgc_cg; cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) assert(!"blkfree: freeing free block"); ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) assert(!"blkfree: freeing free frag"); setbit(blksfree, cgbno + i); } cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } } } /* * Recursively free all indirect blocks. */ static void freeindir(ufs2_daddr_t blk, int level) { char sblks[MAXBSIZE]; ufs2_daddr_t *blks; int i; if (bread(disk, fsbtodb(fs, blk), (void *)&sblks, (size_t)fs->fs_bsize) == -1) err(1, "bread: %s", disk->d_error); blks = (ufs2_daddr_t *)&sblks; for (i = 0; i < NINDIR(fs); i++) { if (blks[i] == 0) break; if (level == 0) blkfree(blks[i], fs->fs_bsize); else freeindir(blks[i], level - 1); } blkfree(blk, fs->fs_bsize); } #define dblksize(fs, dino, lbn) \ ((dino)->di_size >= smalllblktosize(fs, (lbn) + 1) \ ? (fs)->fs_bsize \ : fragroundup(fs, blkoff(fs, (dino)->di_size))) /* * Free all blocks associated with the given inode. 
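freeindir() above is a post-order walk: at level 0 the entries are data-block addresses, at higher levels each entry names another indirect block one level down, and the indirect block itself is released last; like the source, the scan stops at the first zero entry. An in-memory analogue (struct blk and the tiny NPTR fan-out are fabrications for the sketch):

#include <stdlib.h>

#define NPTR 4                  /* pointers per indirect block (tiny) */

struct blk {
        struct blk *ptr[NPTR];  /* children; NULL ends the run */
};

static void
freeindir(struct blk *bp, int level)
{
        int i;

        for (i = 0; i < NPTR; i++) {
                if (bp->ptr[i] == NULL)
                        break;          /* first hole ends the scan */
                if (level == 0)
                        free(bp->ptr[i]);       /* cf. blkfree() on data */
                else
                        freeindir(bp->ptr[i], level - 1);
        }
        free(bp);                       /* the indirect block goes last */
}

int
main(void)
{
        struct blk *root;

        if ((root = calloc(1, sizeof(*root))) == NULL)
                return (1);
        root->ptr[0] = calloc(1, sizeof(*root));        /* one "data" block */
        freeindir(root, 0);
        return (0);
}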
*/ static void clear_inode(struct ufs2_dinode *dino) { ufs2_daddr_t bn; int extblocks, i, level; off_t osize; long bsize; extblocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC && dino->di_extsize > 0) extblocks = btodb(fragroundup(fs, dino->di_extsize)); /* deallocate external attributes blocks */ if (extblocks > 0) { osize = dino->di_extsize; dino->di_blocks -= extblocks; dino->di_extsize = 0; - for (i = 0; i < NXADDR; i++) { + for (i = 0; i < UFS_NXADDR; i++) { if (dino->di_extb[i] == 0) continue; blkfree(dino->di_extb[i], sblksize(fs, osize, i)); } } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* deallocate indirect blocks */ for (level = SINGLE; level <= TRIPLE; level++) { if (dino->di_ib[level] == 0) break; freeindir(dino->di_ib[level], level); } /* deallocate direct blocks and fragments */ - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < UFS_NDADDR; i++) { bn = dino->di_db[i]; if (bn == 0) continue; bsize = dblksize(fs, dino, i); blkfree(bn, bsize); } } void gjournal_check(const char *filesys) { struct ufs2_dinode *dino; void *p; struct cgchain *cgc; struct cg *cgp; uint8_t *inosused; ino_t cino, ino; int cg, mode; devnam = filesys; opendisk(); /* Are there any unreferenced inodes in this file system? */ if (fs->fs_unrefs == 0) { //printf("No unreferenced inodes.\n"); closedisk(); return; } for (cg = 0; cg < fs->fs_ncg; cg++) { /* Show progress if requested. */ if (got_siginfo) { printf("%s: phase j: cyl group %d of %d (%d%%)\n", cdevname, cg, fs->fs_ncg, cg * 100 / fs->fs_ncg); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s pj %d%%", cdevname, cg * 100 / fs->fs_ncg); got_sigalarm = 0; } cgc = getcg(cg); cgp = &cgc->cgc_cg; /* Are there any unreferenced inodes in this cylinder group? */ if (cgp->cg_unrefs == 0) continue; //printf("Analizing cylinder group %d (count=%d)\n", cg, cgp->cg_unrefs); /* * We are going to modify this cylinder group, so we want it to * be written back. */ dirtycg(cgc); /* We don't want it to be freed in the meantime. */ busycg(cgc); inosused = cg_inosused(cgp); /* * Now go through the list of all inodes in this cylinder group * to find unreferenced ones. */ for (cino = 0; cino < fs->fs_ipg; cino++) { ino = fs->fs_ipg * cg + cino; /* Unallocated? Skip it. */ if (isclr(inosused, cino)) continue; if (getino(disk, &p, ino, &mode) == -1) err(1, "getino(cg=%d ino=%ju)", cg, (uintmax_t)ino); dino = p; /* Not a regular file nor directory? Skip it. */ if (!S_ISREG(dino->di_mode) && !S_ISDIR(dino->di_mode)) continue; /* Has reference(s)? Skip it. */ if (dino->di_nlink > 0) continue; //printf("Clearing inode=%d (size=%jd)\n", ino, (intmax_t)dino->di_size); /* Free inode's blocks. */ clear_inode(dino); /* Deallocate it. */ clrbit(inosused, cino); /* Update position of last used inode. */ if (ino < cgp->cg_irotor) cgp->cg_irotor = ino; /* Update statistics. */ cgp->cg_cs.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; fs->fs_cstotal.cs_nifree++; cgp->cg_unrefs--; fs->fs_unrefs--; /* If this is directory, update related statistics. */ if (S_ISDIR(dino->di_mode)) { cgp->cg_cs.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; fs->fs_cstotal.cs_ndir--; } /* Zero-fill the inode. */ *dino = ufs2_zino; /* Write the inode back. 
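 * The dinode was zero-filled above, so writing it back leaves the
 * on-disk slot ready for reallocation.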
*/ if (putino(disk) == -1) err(1, "putino(cg=%d ino=%ju)", cg, (uintmax_t)ino); if (cgp->cg_unrefs == 0) { //printf("No more unreferenced inodes in cg=%d.\n", cg); break; } } /* * We don't need this cylinder group anymore, so feel free to * free it if needed. */ unbusycg(cgc); /* * If there are no more unreferenced inodes, there is no need to * check other cylinder groups. */ if (fs->fs_unrefs == 0) { //printf("No more unreferenced inodes (cg=%d/%d).\n", cg, // fs->fs_ncg); break; } } /* Write back modified cylinder groups. */ putcgs(); /* Write back updated statistics and super-block. */ closedisk(); } Index: head/sbin/fsck_ffs/inode.c =================================================================== --- head/sbin/fsck_ffs/inode.c (revision 313779) +++ head/sbin/fsck_ffs/inode.c (revision 313780) @@ -1,734 +1,734 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #if 0 #ifndef lint static const char sccsid[] = "@(#)inode.c 8.8 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "fsck.h" static ino_t startinum; static int iblock(struct inodesc *, long ilevel, off_t isize, int type); int ckinode(union dinode *dp, struct inodesc *idesc) { off_t remsize, sizepb; int i, offset, ret; union dinode dino; ufs2_daddr_t ndb; mode_t mode; char pathbuf[MAXPATHLEN + 1]; if (idesc->id_fix != IGNORE) idesc->id_fix = DONTKNOW; idesc->id_lbn = -1; idesc->id_entryno = 0; idesc->id_filesize = DIP(dp, di_size); mode = DIP(dp, di_mode) & IFMT; if (mode == IFBLK || mode == IFCHR || (mode == IFLNK && DIP(dp, di_size) < (unsigned)sblock.fs_maxsymlinklen)) return (KEEPON); if (sblock.fs_magic == FS_UFS1_MAGIC) dino.dp1 = dp->dp1; else dino.dp2 = dp->dp2; ndb = howmany(DIP(&dino, di_size), sblock.fs_bsize); - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < UFS_NDADDR; i++) { idesc->id_lbn++; if (--ndb == 0 && (offset = blkoff(&sblock, DIP(&dino, di_size))) != 0) idesc->id_numfrags = numfrags(&sblock, fragroundup(&sblock, offset)); else idesc->id_numfrags = sblock.fs_frag; if (DIP(&dino, di_db[i]) == 0) { if (idesc->id_type == DATA && ndb >= 0) { /* An empty block in a directory XXX */ getpathname(pathbuf, idesc->id_number, idesc->id_number); pfatal("DIRECTORY %s: CONTAINS EMPTY BLOCKS", pathbuf); if (reply("ADJUST LENGTH") == 1) { dp = ginode(idesc->id_number); DIP_SET(dp, di_size, i * sblock.fs_bsize); printf( "YOU MUST RERUN FSCK AFTERWARDS\n"); rerun = 1; inodirty(); } } continue; } idesc->id_blkno = DIP(&dino, di_db[i]); if (idesc->id_type != DATA) ret = (*idesc->id_func)(idesc); else ret = dirscan(idesc); if (ret & STOP) return (ret); } idesc->id_numfrags = sblock.fs_frag; - remsize = DIP(&dino, di_size) - sblock.fs_bsize * NDADDR; + remsize = DIP(&dino, di_size) - sblock.fs_bsize * UFS_NDADDR; sizepb = sblock.fs_bsize; - for (i = 0; i < NIADDR; i++) { + for (i = 0; i < UFS_NIADDR; i++) { sizepb *= NINDIR(&sblock); if (DIP(&dino, di_ib[i])) { idesc->id_blkno = DIP(&dino, di_ib[i]); ret = iblock(idesc, i + 1, remsize, BT_LEVEL1 + i); if (ret & STOP) return (ret); } else { idesc->id_lbn += sizepb / sblock.fs_bsize; if (idesc->id_type == DATA && remsize > 0) { /* An empty block in a directory XXX */ getpathname(pathbuf, idesc->id_number, idesc->id_number); pfatal("DIRECTORY %s: CONTAINS EMPTY BLOCKS", pathbuf); if (reply("ADJUST LENGTH") == 1) { dp = ginode(idesc->id_number); DIP_SET(dp, di_size, DIP(dp, di_size) - remsize); remsize = 0; printf( "YOU MUST RERUN FSCK AFTERWARDS\n"); rerun = 1; inodirty(); break; } } } remsize -= sizepb; } return (KEEPON); } static int iblock(struct inodesc *idesc, long ilevel, off_t isize, int type) { struct bufarea *bp; int i, n, (*func)(struct inodesc *), nif; off_t sizepb; char buf[BUFSIZ]; char pathbuf[MAXPATHLEN + 1]; union dinode *dp; if (idesc->id_type != DATA) { func = idesc->id_func; if (((n = (*func)(idesc)) & KEEPON) == 0) return (n); } else func = dirscan; if (chkrange(idesc->id_blkno, idesc->id_numfrags)) return (SKIP); bp = getdatablk(idesc->id_blkno, sblock.fs_bsize, type); ilevel--; for (sizepb = sblock.fs_bsize, i = 0; i < ilevel; i++) sizepb *= NINDIR(&sblock); if (howmany(isize, sizepb) > NINDIR(&sblock)) nif = NINDIR(&sblock); else nif = howmany(isize, sizepb); if (idesc->id_func == pass1check && nif < NINDIR(&sblock)) { for (i = nif; i < NINDIR(&sblock); i++) { if (IBLK(bp, i) == 0) 
continue; (void)sprintf(buf, "PARTIALLY TRUNCATED INODE I=%lu", (u_long)idesc->id_number); if (preen) { pfatal("%s", buf); } else if (dofix(idesc, buf)) { IBLK_SET(bp, i, 0); dirty(bp); } } flush(fswritefd, bp); } for (i = 0; i < nif; i++) { if (ilevel == 0) idesc->id_lbn++; if (IBLK(bp, i)) { idesc->id_blkno = IBLK(bp, i); if (ilevel == 0) n = (*func)(idesc); else n = iblock(idesc, ilevel, isize, type); if (n & STOP) { bp->b_flags &= ~B_INUSE; return (n); } } else { if (idesc->id_type == DATA && isize > 0) { /* An empty block in a directory XXX */ getpathname(pathbuf, idesc->id_number, idesc->id_number); pfatal("DIRECTORY %s: CONTAINS EMPTY BLOCKS", pathbuf); if (reply("ADJUST LENGTH") == 1) { dp = ginode(idesc->id_number); DIP_SET(dp, di_size, DIP(dp, di_size) - isize); isize = 0; printf( "YOU MUST RERUN FSCK AFTERWARDS\n"); rerun = 1; inodirty(); bp->b_flags &= ~B_INUSE; return(STOP); } } } isize -= sizepb; } bp->b_flags &= ~B_INUSE; return (KEEPON); } /* * Check that a block in a legal block number. * Return 0 if in range, 1 if out of range. */ int chkrange(ufs2_daddr_t blk, int cnt) { int c; if (cnt <= 0 || blk <= 0 || blk > maxfsblock || cnt - 1 > maxfsblock - blk) return (1); if (cnt > sblock.fs_frag || fragnum(&sblock, blk) + cnt > sblock.fs_frag) { if (debug) printf("bad size: blk %ld, offset %i, size %d\n", (long)blk, (int)fragnum(&sblock, blk), cnt); return (1); } c = dtog(&sblock, blk); if (blk < cgdmin(&sblock, c)) { if ((blk + cnt) > cgsblock(&sblock, c)) { if (debug) { printf("blk %ld < cgdmin %ld;", (long)blk, (long)cgdmin(&sblock, c)); printf(" blk + cnt %ld > cgsbase %ld\n", (long)(blk + cnt), (long)cgsblock(&sblock, c)); } return (1); } } else { if ((blk + cnt) > cgbase(&sblock, c+1)) { if (debug) { printf("blk %ld >= cgdmin %ld;", (long)blk, (long)cgdmin(&sblock, c)); printf(" blk + cnt %ld > sblock.fs_fpg %ld\n", (long)(blk + cnt), (long)sblock.fs_fpg); } return (1); } } return (0); } /* * General purpose interface for reading inodes. */ union dinode * ginode(ino_t inumber) { ufs2_daddr_t iblk; - if (inumber < ROOTINO || inumber > maxino) + if (inumber < UFS_ROOTINO || inumber > maxino) errx(EEXIT, "bad inode number %ju to ginode", (uintmax_t)inumber); if (startinum == 0 || inumber < startinum || inumber >= startinum + INOPB(&sblock)) { iblk = ino_to_fsba(&sblock, inumber); if (pbp != NULL) pbp->b_flags &= ~B_INUSE; pbp = getdatablk(iblk, sblock.fs_bsize, BT_INODES); startinum = rounddown(inumber, INOPB(&sblock)); } if (sblock.fs_magic == FS_UFS1_MAGIC) return ((union dinode *) &pbp->b_un.b_dinode1[inumber % INOPB(&sblock)]); return ((union dinode *)&pbp->b_un.b_dinode2[inumber % INOPB(&sblock)]); } /* * Special purpose version of ginode used to optimize first pass * over all the inodes in numerical order. */ static ino_t nextino, lastinum, lastvalidinum; static long readcount, readpercg, fullcnt, inobufsize, partialcnt, partialsize; static struct bufarea inobuf; union dinode * getnextinode(ino_t inumber, int rebuildcg) { int j; long size; mode_t mode; ufs2_daddr_t ndb, blk; union dinode *dp; static caddr_t nextinop; if (inumber != nextino++ || inumber > lastvalidinum) errx(EEXIT, "bad inode number %ju to nextinode", (uintmax_t)inumber); if (inumber >= lastinum) { readcount++; blk = ino_to_fsba(&sblock, lastinum); if (readcount % readpercg == 0) { size = partialsize; lastinum += partialcnt; } else { size = inobufsize; lastinum += fullcnt; } /* * If getblk encounters an error, it will already have zeroed * out the buffer, so we do not need to do so here. 
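 * Inodes are read one buffer-full at a time; the last read in each
 * cylinder group may be the shorter partialsize chunk computed by
 * setinodebuf() below.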
*/ getblk(&inobuf, blk, size); nextinop = inobuf.b_un.b_buf; } dp = (union dinode *)nextinop; if (rebuildcg && nextinop == inobuf.b_un.b_buf) { /* * Try to determine if we have reached the end of the * allocated inodes. */ mode = DIP(dp, di_mode) & IFMT; if (mode == 0) { if (memcmp(dp->dp2.di_db, ufs2_zino.di_db, - NDADDR * sizeof(ufs2_daddr_t)) || + UFS_NDADDR * sizeof(ufs2_daddr_t)) || memcmp(dp->dp2.di_ib, ufs2_zino.di_ib, - NIADDR * sizeof(ufs2_daddr_t)) || + UFS_NIADDR * sizeof(ufs2_daddr_t)) || dp->dp2.di_mode || dp->dp2.di_size) return (NULL); goto inodegood; } if (!ftypeok(dp)) return (NULL); ndb = howmany(DIP(dp, di_size), sblock.fs_bsize); if (ndb < 0) return (NULL); if (mode == IFBLK || mode == IFCHR) ndb++; if (mode == IFLNK) { /* * Fake ndb value so direct/indirect block checks below * will detect any garbage after symlink string. */ if (DIP(dp, di_size) < (off_t)sblock.fs_maxsymlinklen) { ndb = howmany(DIP(dp, di_size), sizeof(ufs2_daddr_t)); - if (ndb > NDADDR) { - j = ndb - NDADDR; + if (ndb > UFS_NDADDR) { + j = ndb - UFS_NDADDR; for (ndb = 1; j > 1; j--) ndb *= NINDIR(&sblock); - ndb += NDADDR; + ndb += UFS_NDADDR; } } } - for (j = ndb; ndb < NDADDR && j < NDADDR; j++) + for (j = ndb; ndb < UFS_NDADDR && j < UFS_NDADDR; j++) if (DIP(dp, di_db[j]) != 0) return (NULL); - for (j = 0, ndb -= NDADDR; ndb > 0; j++) + for (j = 0, ndb -= UFS_NDADDR; ndb > 0; j++) ndb /= NINDIR(&sblock); - for (; j < NIADDR; j++) + for (; j < UFS_NIADDR; j++) if (DIP(dp, di_ib[j]) != 0) return (NULL); } inodegood: if (sblock.fs_magic == FS_UFS1_MAGIC) nextinop += sizeof(struct ufs1_dinode); else nextinop += sizeof(struct ufs2_dinode); return (dp); } void setinodebuf(ino_t inum) { if (inum % sblock.fs_ipg != 0) errx(EEXIT, "bad inode number %ju to setinodebuf", (uintmax_t)inum); lastvalidinum = inum + sblock.fs_ipg - 1; startinum = 0; nextino = inum; lastinum = inum; readcount = 0; if (inobuf.b_un.b_buf != NULL) return; inobufsize = blkroundup(&sblock, INOBUFSIZE); fullcnt = inobufsize / ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode)); readpercg = sblock.fs_ipg / fullcnt; partialcnt = sblock.fs_ipg % fullcnt; partialsize = partialcnt * ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode)); if (partialcnt != 0) { readpercg++; } else { partialcnt = fullcnt; partialsize = inobufsize; } initbarea(&inobuf, BT_INODES); if ((inobuf.b_un.b_buf = Malloc((unsigned)inobufsize)) == NULL) errx(EEXIT, "cannot allocate space for inode buffer"); } void freeinodebuf(void) { if (inobuf.b_un.b_buf != NULL) free((char *)inobuf.b_un.b_buf); inobuf.b_un.b_buf = NULL; } /* * Routines to maintain information about directory inodes. * This is built during the first pass and used during the * second and third passes. * * Enter inodes into the cache. */ void cacheino(union dinode *dp, ino_t inumber) { struct inoinfo *inp, **inpp; int i, blks; - if (howmany(DIP(dp, di_size), sblock.fs_bsize) > NDADDR) - blks = NDADDR + NIADDR; + if (howmany(DIP(dp, di_size), sblock.fs_bsize) > UFS_NDADDR) + blks = UFS_NDADDR + UFS_NIADDR; else blks = howmany(DIP(dp, di_size), sblock.fs_bsize); inp = (struct inoinfo *) Malloc(sizeof(*inp) + (blks - 1) * sizeof(ufs2_daddr_t)); if (inp == NULL) errx(EEXIT, "cannot increase directory list"); inpp = &inphead[inumber % dirhash]; inp->i_nexthash = *inpp; *inpp = inp; - inp->i_parent = inumber == ROOTINO ? ROOTINO : (ino_t)0; + inp->i_parent = inumber == UFS_ROOTINO ? 
UFS_ROOTINO : (ino_t)0; inp->i_dotdot = (ino_t)0; inp->i_number = inumber; inp->i_isize = DIP(dp, di_size); inp->i_numblks = blks; - for (i = 0; i < MIN(blks, NDADDR); i++) + for (i = 0; i < MIN(blks, UFS_NDADDR); i++) inp->i_blks[i] = DIP(dp, di_db[i]); - if (blks > NDADDR) - for (i = 0; i < NIADDR; i++) - inp->i_blks[NDADDR + i] = DIP(dp, di_ib[i]); + if (blks > UFS_NDADDR) + for (i = 0; i < UFS_NIADDR; i++) + inp->i_blks[UFS_NDADDR + i] = DIP(dp, di_ib[i]); if (inplast == listmax) { listmax += 100; inpsort = (struct inoinfo **)realloc((char *)inpsort, (unsigned)listmax * sizeof(struct inoinfo *)); if (inpsort == NULL) errx(EEXIT, "cannot increase directory list"); } inpsort[inplast++] = inp; } /* * Look up an inode cache structure. */ struct inoinfo * getinoinfo(ino_t inumber) { struct inoinfo *inp; for (inp = inphead[inumber % dirhash]; inp; inp = inp->i_nexthash) { if (inp->i_number != inumber) continue; return (inp); } errx(EEXIT, "cannot find inode %ju", (uintmax_t)inumber); return ((struct inoinfo *)0); } /* * Clean up all the inode cache structure. */ void inocleanup(void) { struct inoinfo **inpp; if (inphead == NULL) return; for (inpp = &inpsort[inplast - 1]; inpp >= inpsort; inpp--) free((char *)(*inpp)); free((char *)inphead); free((char *)inpsort); inphead = inpsort = NULL; } void inodirty(void) { dirty(pbp); } void clri(struct inodesc *idesc, const char *type, int flag) { union dinode *dp; dp = ginode(idesc->id_number); if (flag == 1) { pwarn("%s %s", type, (DIP(dp, di_mode) & IFMT) == IFDIR ? "DIR" : "FILE"); pinode(idesc->id_number); } if (preen || reply("CLEAR") == 1) { if (preen) printf(" (CLEARED)\n"); n_files--; if (bkgrdflag == 0) { (void)ckinode(dp, idesc); inoinfo(idesc->id_number)->ino_state = USTATE; clearinode(dp); inodirty(); } else { cmd.value = idesc->id_number; cmd.size = -DIP(dp, di_nlink); if (debug) printf("adjrefcnt ino %ld amt %lld\n", (long)cmd.value, (long long)cmd.size); if (sysctl(adjrefcnt, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST INODE", cmd.value); } } } int findname(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (dirp->d_ino != idesc->id_parent || idesc->id_entryno < 2) { idesc->id_entryno++; return (KEEPON); } memmove(idesc->id_name, dirp->d_name, (size_t)dirp->d_namlen + 1); return (STOP|FOUND); } int findino(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (dirp->d_ino == 0) return (KEEPON); if (strcmp(dirp->d_name, idesc->id_name) == 0 && - dirp->d_ino >= ROOTINO && dirp->d_ino <= maxino) { + dirp->d_ino >= UFS_ROOTINO && dirp->d_ino <= maxino) { idesc->id_parent = dirp->d_ino; return (STOP|FOUND); } return (KEEPON); } int clearentry(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (dirp->d_ino != idesc->id_parent || idesc->id_entryno < 2) { idesc->id_entryno++; return (KEEPON); } dirp->d_ino = 0; return (STOP|FOUND|ALTERED); } void pinode(ino_t ino) { union dinode *dp; char *p; struct passwd *pw; time_t t; printf(" I=%lu ", (u_long)ino); - if (ino < ROOTINO || ino > maxino) + if (ino < UFS_ROOTINO || ino > maxino) return; dp = ginode(ino); printf(" OWNER="); if ((pw = getpwuid((int)DIP(dp, di_uid))) != NULL) printf("%s ", pw->pw_name); else printf("%u ", (unsigned)DIP(dp, di_uid)); printf("MODE=%o\n", DIP(dp, di_mode)); if (preen) printf("%s: ", cdevname); printf("SIZE=%ju ", (uintmax_t)DIP(dp, di_size)); t = DIP(dp, di_mtime); p = ctime(&t); printf("MTIME=%12.12s %4.4s ", &p[4], &p[20]); } void blkerror(ino_t ino, const char *type, ufs2_daddr_t blk) { pfatal("%jd %s I=%ju", 
(intmax_t)blk, type, (uintmax_t)ino); printf("\n"); switch (inoinfo(ino)->ino_state) { case FSTATE: case FZLINK: inoinfo(ino)->ino_state = FCLEAR; return; case DSTATE: case DZLINK: inoinfo(ino)->ino_state = DCLEAR; return; case FCLEAR: case DCLEAR: return; default: errx(EEXIT, "BAD STATE %d TO BLKERR", inoinfo(ino)->ino_state); /* NOTREACHED */ } } /* * allocate an unused inode */ ino_t allocino(ino_t request, int type) { ino_t ino; union dinode *dp; struct bufarea *cgbp; struct cg *cgp; int cg; if (request == 0) - request = ROOTINO; + request = UFS_ROOTINO; else if (inoinfo(request)->ino_state != USTATE) return (0); for (ino = request; ino < maxino; ino++) if (inoinfo(ino)->ino_state == USTATE) break; if (ino == maxino) return (0); cg = ino_to_cg(&sblock, ino); cgbp = cgget(cg); cgp = cgbp->b_un.b_cg; if (!check_cgmagic(cg, cgbp)) return (0); setbit(cg_inosused(cgp), ino % sblock.fs_ipg); cgp->cg_cs.cs_nifree--; switch (type & IFMT) { case IFDIR: inoinfo(ino)->ino_state = DSTATE; cgp->cg_cs.cs_ndir++; break; case IFREG: case IFLNK: inoinfo(ino)->ino_state = FSTATE; break; default: return (0); } dirty(cgbp); dp = ginode(ino); DIP_SET(dp, di_db[0], allocblk((long)1)); if (DIP(dp, di_db[0]) == 0) { inoinfo(ino)->ino_state = USTATE; return (0); } DIP_SET(dp, di_mode, type); DIP_SET(dp, di_flags, 0); DIP_SET(dp, di_atime, time(NULL)); DIP_SET(dp, di_ctime, DIP(dp, di_atime)); DIP_SET(dp, di_mtime, DIP(dp, di_ctime)); DIP_SET(dp, di_mtimensec, 0); DIP_SET(dp, di_ctimensec, 0); DIP_SET(dp, di_atimensec, 0); DIP_SET(dp, di_size, sblock.fs_fsize); DIP_SET(dp, di_blocks, btodb(sblock.fs_fsize)); n_files++; inodirty(); inoinfo(ino)->ino_type = IFTODT(type); return (ino); } /* * deallocate an inode */ void freeino(ino_t ino) { struct inodesc idesc; union dinode *dp; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = ADDR; idesc.id_func = pass4check; idesc.id_number = ino; dp = ginode(ino); (void)ckinode(dp, &idesc); clearinode(dp); inodirty(); inoinfo(ino)->ino_state = USTATE; n_files--; } Index: head/sbin/fsck_ffs/main.c =================================================================== --- head/sbin/fsck_ffs/main.c (revision 313779) +++ head/sbin/fsck_ffs/main.c (revision 313780) @@ -1,688 +1,688 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1986, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint static char sccsid[] = "@(#)main.c 8.6 (Berkeley) 5/14/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsck.h" int restarts; static void usage(void) __dead2; static intmax_t argtoimax(int flag, const char *req, const char *str, int base); static int checkfilesys(char *filesys); static int chkdoreload(struct statfs *mntp); static struct statfs *getmntpt(const char *); int main(int argc, char *argv[]) { int ch; struct rlimit rlimit; struct itimerval itimerval; int ret = 0; sync(); skipclean = 1; inoopt = 0; while ((ch = getopt(argc, argv, "b:Bc:CdEfFm:npRrSyZ")) != -1) { switch (ch) { case 'b': skipclean = 0; bflag = argtoimax('b', "number", optarg, 10); printf("Alternate super block location: %jd\n", bflag); break; case 'B': bkgrdflag = 1; break; case 'c': skipclean = 0; cvtlevel = argtoimax('c', "conversion level", optarg, 10); if (cvtlevel < 3) errx(EEXIT, "cannot do level %d conversion", cvtlevel); break; case 'd': debug++; break; case 'E': Eflag++; break; case 'f': skipclean = 0; break; case 'F': bkgrdcheck = 1; break; case 'm': lfmode = argtoimax('m', "mode", optarg, 8); if (lfmode &~ 07777) errx(EEXIT, "bad mode to -m: %o", lfmode); printf("** lost+found creation mode %o\n", lfmode); break; case 'n': nflag++; yflag = 0; break; case 'p': preen++; /*FALLTHROUGH*/ case 'C': ckclean++; break; case 'R': wantrestart = 1; break; case 'r': inoopt++; break; case 'S': surrender = 1; break; case 'y': yflag++; nflag = 0; break; case 'Z': Zflag++; break; default: usage(); } } argc -= optind; argv += optind; if (!argc) usage(); if (signal(SIGINT, SIG_IGN) != SIG_IGN) (void)signal(SIGINT, catch); if (ckclean) (void)signal(SIGQUIT, catchquit); signal(SIGINFO, infohandler); if (bkgrdflag) { signal(SIGALRM, alarmhandler); itimerval.it_interval.tv_sec = 5; itimerval.it_interval.tv_usec = 0; itimerval.it_value.tv_sec = 5; itimerval.it_value.tv_usec = 0; setitimer(ITIMER_REAL, &itimerval, NULL); } /* * Push up our allowed memory limit so we can cope * with huge file systems. */ if (getrlimit(RLIMIT_DATA, &rlimit) == 0) { rlimit.rlim_cur = rlimit.rlim_max; (void)setrlimit(RLIMIT_DATA, &rlimit); } while (argc > 0) { if (checkfilesys(*argv) == ERESTART) continue; argc--; argv++; } if (returntosingle) ret = 2; exit(ret); } static intmax_t argtoimax(int flag, const char *req, const char *str, int base) { char *cp; intmax_t ret; ret = strtoimax(str, &cp, base); if (cp == str || *cp) errx(EEXIT, "-%c flag requires a %s", flag, req); return (ret); } /* * Check the specified file system. 
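 *
 * (Editor's aside, not part of this change: argtoimax() above uses
 * the strict strtoimax(3) idiom.  A hypothetical boolean variant,
 * for illustration only: the parse succeeds only when at least one
 * digit was consumed and nothing trails the number.)
 */
static int
parse_imax(const char *str, int base, intmax_t *out)
{
	char *cp;

	*out = strtoimax(str, &cp, base);
	return (cp != str && *cp == '\0');	/* 1 on success, 0 on junk */
}
/*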
*/ /* ARGSUSED */ static int checkfilesys(char *filesys) { ufs2_daddr_t n_ffree, n_bfree; struct dups *dp; struct statfs *mntp; struct stat snapdir; struct group *grp; struct iovec *iov; char errmsg[255]; int iovlen; int cylno; intmax_t blks, files; size_t size; iov = NULL; iovlen = 0; errmsg[0] = '\0'; fsutilinit(); fsckinit(); cdevname = filesys; if (debug && ckclean) pwarn("starting\n"); /* * Make best effort to get the disk name. Check first to see * if it is listed among the mounted file systems. Failing that * check to see if it is listed in /etc/fstab. */ mntp = getmntpt(filesys); if (mntp != NULL) filesys = mntp->f_mntfromname; else filesys = blockcheck(filesys); /* * If -F flag specified, check to see whether a background check * is possible and needed. If possible and needed, exit with * status zero. Otherwise exit with status non-zero. A non-zero * exit status will cause a foreground check to be run. */ sblock_init(); if (bkgrdcheck) { if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0) exit(3); /* Cannot read superblock */ close(fsreadfd); /* Earlier background failed or journaled */ if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ)) exit(4); if ((sblock.fs_flags & FS_DOSOFTDEP) == 0) exit(5); /* Not running soft updates */ size = MIBSIZE; if (sysctlnametomib("vfs.ffs.adjrefcnt", adjrefcnt, &size) < 0) exit(6); /* Lacks kernel support */ if ((mntp == NULL && sblock.fs_clean == 1) || (mntp != NULL && (sblock.fs_flags & FS_UNCLEAN) == 0)) exit(7); /* Filesystem clean, report it now */ exit(0); } if (ckclean && skipclean) { /* * If file system is gjournaled, check it here. */ if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0) exit(3); /* Cannot read superblock */ close(fsreadfd); if ((sblock.fs_flags & FS_GJOURNAL) != 0) { //printf("GJournaled file system detected on %s.\n", // filesys); if (sblock.fs_clean == 1) { pwarn("FILE SYSTEM CLEAN; SKIPPING CHECKS\n"); exit(0); } if ((sblock.fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) { gjournal_check(filesys); if (chkdoreload(mntp) == 0) exit(0); exit(4); } else { pfatal( "UNEXPECTED INCONSISTENCY, CANNOT RUN FAST FSCK\n"); } } } /* * If we are to do a background check: * Get the mount point information of the file system * create snapshot file * return created snapshot file * if not found, clear bkgrdflag and proceed with normal fsck */ if (bkgrdflag) { if (mntp == NULL) { bkgrdflag = 0; pfatal("NOT MOUNTED, CANNOT RUN IN BACKGROUND\n"); } else if ((mntp->f_flags & MNT_SOFTDEP) == 0) { bkgrdflag = 0; pfatal( "NOT USING SOFT UPDATES, CANNOT RUN IN BACKGROUND\n"); } else if ((mntp->f_flags & MNT_RDONLY) != 0) { bkgrdflag = 0; pfatal("MOUNTED READ-ONLY, CANNOT RUN IN BACKGROUND\n"); } else if ((fsreadfd = open(filesys, O_RDONLY)) >= 0) { if (readsb(0) != 0) { if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ)) { bkgrdflag = 0; pfatal( "UNEXPECTED INCONSISTENCY, CANNOT RUN IN BACKGROUND\n"); } if ((sblock.fs_flags & FS_UNCLEAN) == 0 && skipclean && ckclean) { /* * file system is clean; * skip snapshot and report it clean */ pwarn( "FILE SYSTEM CLEAN; SKIPPING CHECKS\n"); goto clean; } } close(fsreadfd); } if (bkgrdflag) { snprintf(snapname, sizeof snapname, "%s/.snap", mntp->f_mntonname); if (stat(snapname, &snapdir) < 0) { if (errno != ENOENT) { bkgrdflag = 0; pfatal( "CANNOT FIND SNAPSHOT DIRECTORY %s: %s, CANNOT RUN IN BACKGROUND\n", snapname, strerror(errno)); } else if ((grp = getgrnam("operator")) == NULL || mkdir(snapname, 0770) < 0 || chown(snapname, -1, grp->gr_gid) < 0 || chmod(snapname, 0770) < 0) { bkgrdflag = 
0; pfatal( "CANNOT CREATE SNAPSHOT DIRECTORY %s: %s, CANNOT RUN IN BACKGROUND\n", snapname, strerror(errno)); } } else if (!S_ISDIR(snapdir.st_mode)) { bkgrdflag = 0; pfatal( "%s IS NOT A DIRECTORY, CANNOT RUN IN BACKGROUND\n", snapname); } } if (bkgrdflag) { snprintf(snapname, sizeof snapname, "%s/.snap/fsck_snapshot", mntp->f_mntonname); build_iovec(&iov, &iovlen, "fstype", "ffs", 4); build_iovec(&iov, &iovlen, "from", snapname, (size_t)-1); build_iovec(&iov, &iovlen, "fspath", mntp->f_mntonname, (size_t)-1); build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); build_iovec(&iov, &iovlen, "update", NULL, 0); build_iovec(&iov, &iovlen, "snapshot", NULL, 0); while (nmount(iov, iovlen, mntp->f_flags) < 0) { if (errno == EEXIST && unlink(snapname) == 0) continue; bkgrdflag = 0; pfatal("CANNOT CREATE SNAPSHOT %s: %s %s\n", snapname, strerror(errno), errmsg); break; } if (bkgrdflag != 0) filesys = snapname; } } switch (setup(filesys)) { case 0: if (preen) pfatal("CAN'T CHECK FILE SYSTEM."); return (0); case -1: clean: pwarn("clean, %ld free ", (long)(sblock.fs_cstotal.cs_nffree + sblock.fs_frag * sblock.fs_cstotal.cs_nbfree)); printf("(%jd frags, %jd blocks, %.1f%% fragmentation)\n", (intmax_t)sblock.fs_cstotal.cs_nffree, (intmax_t)sblock.fs_cstotal.cs_nbfree, sblock.fs_cstotal.cs_nffree * 100.0 / sblock.fs_dsize); return (0); } /* * Determine if we can and should do journal recovery. */ if ((sblock.fs_flags & FS_SUJ) == FS_SUJ) { if ((sblock.fs_flags & FS_NEEDSFSCK) != FS_NEEDSFSCK && skipclean) { if (preen || reply("USE JOURNAL")) { if (suj_check(filesys) == 0) { printf("\n***** FILE SYSTEM MARKED CLEAN *****\n"); if (chkdoreload(mntp) == 0) exit(0); exit(4); } } printf("** Skipping journal, falling through to full fsck\n\n"); } /* * Write the superblock so we don't try to recover the * journal on another pass. */ sblock.fs_mtime = time(NULL); sbdirty(); } /* * Cleared if any questions answered no. Used to decide if * the superblock should be marked clean. */ resolved = 1; /* * 1: scan inodes tallying blocks used */ if (preen == 0) { printf("** Last Mounted on %s\n", sblock.fs_fsmnt); if (mntp != NULL && mntp->f_flags & MNT_ROOTFS) printf("** Root file system\n"); printf("** Phase 1 - Check Blocks and Sizes\n"); } clock_gettime(CLOCK_REALTIME_PRECISE, &startprog); pass1(); IOstats("Pass1"); /* * 1b: locate first references to duplicates, if any */ if (duplist) { if (preen || usedsoftdep) pfatal("INTERNAL ERROR: dups with %s%s%s", preen ? "-p" : "", (preen && usedsoftdep) ? " and " : "", usedsoftdep ? 
"softupdates" : ""); printf("** Phase 1b - Rescan For More DUPS\n"); pass1b(); IOstats("Pass1b"); } /* * 2: traverse directories from root to mark all connected directories */ if (preen == 0) printf("** Phase 2 - Check Pathnames\n"); pass2(); IOstats("Pass2"); /* * 3: scan inodes looking for disconnected directories */ if (preen == 0) printf("** Phase 3 - Check Connectivity\n"); pass3(); IOstats("Pass3"); /* * 4: scan inodes looking for disconnected files; check reference counts */ if (preen == 0) printf("** Phase 4 - Check Reference Counts\n"); pass4(); IOstats("Pass4"); /* * 5: check and repair resource counts in cylinder groups */ if (preen == 0) printf("** Phase 5 - Check Cyl groups\n"); pass5(); IOstats("Pass5"); /* * print out summary statistics */ n_ffree = sblock.fs_cstotal.cs_nffree; n_bfree = sblock.fs_cstotal.cs_nbfree; - files = maxino - ROOTINO - sblock.fs_cstotal.cs_nifree - n_files; + files = maxino - UFS_ROOTINO - sblock.fs_cstotal.cs_nifree - n_files; blks = n_blks + sblock.fs_ncg * (cgdmin(&sblock, 0) - cgsblock(&sblock, 0)); blks += cgsblock(&sblock, 0) - cgbase(&sblock, 0); blks += howmany(sblock.fs_cssize, sblock.fs_fsize); blks = maxfsblock - (n_ffree + sblock.fs_frag * n_bfree) - blks; if (bkgrdflag && (files > 0 || blks > 0)) { countdirs = sblock.fs_cstotal.cs_ndir - countdirs; pwarn("Reclaimed: %ld directories, %jd files, %jd fragments\n", countdirs, files - countdirs, blks); } pwarn("%ld files, %jd used, %ju free ", (long)n_files, (intmax_t)n_blks, (uintmax_t)n_ffree + sblock.fs_frag * n_bfree); printf("(%ju frags, %ju blocks, %.1f%% fragmentation)\n", (uintmax_t)n_ffree, (uintmax_t)n_bfree, n_ffree * 100.0 / sblock.fs_dsize); if (debug) { if (files < 0) printf("%jd inodes missing\n", -files); if (blks < 0) printf("%jd blocks missing\n", -blks); if (duplist != NULL) { printf("The following duplicate blocks remain:"); for (dp = duplist; dp; dp = dp->next) printf(" %jd,", (intmax_t)dp->dup); printf("\n"); } } duplist = (struct dups *)0; muldup = (struct dups *)0; inocleanup(); if (fsmodified) { sblock.fs_time = time(NULL); sbdirty(); } if (cvtlevel && sblk.b_dirty) { /* * Write out the duplicate super blocks */ for (cylno = 0; cylno < sblock.fs_ncg; cylno++) blwrite(fswritefd, (char *)&sblock, fsbtodb(&sblock, cgsblock(&sblock, cylno)), SBLOCKSIZE); } if (rerun) resolved = 0; finalIOstats(); /* * Check to see if the file system is mounted read-write. */ if (bkgrdflag == 0 && mntp != NULL && (mntp->f_flags & MNT_RDONLY) == 0) resolved = 0; ckfini(resolved); for (cylno = 0; cylno < sblock.fs_ncg; cylno++) if (inostathead[cylno].il_stat != NULL) free((char *)inostathead[cylno].il_stat); free((char *)inostathead); inostathead = NULL; if (fsmodified && !preen) printf("\n***** FILE SYSTEM WAS MODIFIED *****\n"); if (rerun) { if (wantrestart && (restarts++ < 10) && (preen || reply("RESTART"))) return (ERESTART); printf("\n***** PLEASE RERUN FSCK *****\n"); } if (chkdoreload(mntp) != 0) { if (!fsmodified) return (0); if (!preen) printf("\n***** REBOOT NOW *****\n"); sync(); return (4); } return (0); } static int chkdoreload(struct statfs *mntp) { struct iovec *iov; int iovlen; char errmsg[255]; if (mntp == NULL) return (0); iov = NULL; iovlen = 0; errmsg[0] = '\0'; /* * We modified a mounted file system. Do a mount update on * it unless it is read-write, so we can continue using it * as safely as possible. 
*/ if (mntp->f_flags & MNT_RDONLY) { build_iovec(&iov, &iovlen, "fstype", "ffs", 4); build_iovec(&iov, &iovlen, "from", mntp->f_mntfromname, (size_t)-1); build_iovec(&iov, &iovlen, "fspath", mntp->f_mntonname, (size_t)-1); build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); build_iovec(&iov, &iovlen, "update", NULL, 0); build_iovec(&iov, &iovlen, "reload", NULL, 0); /* * XX: We need the following line until we clean up * nmount parsing of root mounts and NFS root mounts. */ build_iovec(&iov, &iovlen, "ro", NULL, 0); if (nmount(iov, iovlen, mntp->f_flags) == 0) { return (0); } pwarn("mount reload of '%s' failed: %s %s\n\n", mntp->f_mntonname, strerror(errno), errmsg); return (1); } return (0); } /* * Get the mount point information for name. */ static struct statfs * getmntpt(const char *name) { struct stat devstat, mntdevstat; char device[sizeof(_PATH_DEV) - 1 + MNAMELEN]; char *ddevname; struct statfs *mntbuf, *statfsp; int i, mntsize, isdev; if (stat(name, &devstat) != 0) return (NULL); if (S_ISCHR(devstat.st_mode) || S_ISBLK(devstat.st_mode)) isdev = 1; else isdev = 0; mntsize = getmntinfo(&mntbuf, MNT_NOWAIT); for (i = 0; i < mntsize; i++) { statfsp = &mntbuf[i]; ddevname = statfsp->f_mntfromname; if (*ddevname != '/') { if (strlen(_PATH_DEV) + strlen(ddevname) + 1 > sizeof(statfsp->f_mntfromname)) continue; strcpy(device, _PATH_DEV); strcat(device, ddevname); strcpy(statfsp->f_mntfromname, device); } if (isdev == 0) { if (strcmp(name, statfsp->f_mntonname)) continue; return (statfsp); } if (stat(ddevname, &mntdevstat) == 0 && mntdevstat.st_rdev == devstat.st_rdev) return (statfsp); } statfsp = NULL; return (statfsp); } static void usage(void) { (void) fprintf(stderr, "usage: %s [-BCdEFfnpRrSyZ] [-b block] [-c level] [-m mode] filesystem ...\n", getprogname()); exit(1); } void infohandler(int sig __unused) { got_siginfo = 1; } void alarmhandler(int sig __unused) { got_sigalarm = 1; } Index: head/sbin/fsck_ffs/pass1.c =================================================================== --- head/sbin/fsck_ffs/pass1.c (revision 313779) +++ head/sbin/fsck_ffs/pass1.c (revision 313780) @@ -1,522 +1,522 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)pass1.c 8.6 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "fsck.h" static ufs2_daddr_t badblk; static ufs2_daddr_t dupblk; static ino_t lastino; /* last inode in use */ static int checkinode(ino_t inumber, struct inodesc *, int rebuildcg); void pass1(void) { struct inostat *info; struct inodesc idesc; struct bufarea *cgbp; struct cg *cgp; ino_t inumber, inosused, mininos; ufs2_daddr_t i, cgd; u_int8_t *cp; int c, rebuildcg; badblk = dupblk = lastino = 0; /* * Set file system reserved blocks in used block map. */ for (c = 0; c < sblock.fs_ncg; c++) { cgd = cgdmin(&sblock, c); if (c == 0) { i = cgbase(&sblock, c); } else i = cgsblock(&sblock, c); for (; i < cgd; i++) setbmap(i); } i = sblock.fs_csaddr; cgd = i + howmany(sblock.fs_cssize, sblock.fs_fsize); for (; i < cgd; i++) setbmap(i); /* * Find all allocated blocks. */ memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_func = pass1check; n_files = n_blks = 0; for (c = 0; c < sblock.fs_ncg; c++) { inumber = c * sblock.fs_ipg; setinodebuf(inumber); cgbp = cgget(c); cgp = cgbp->b_un.b_cg; rebuildcg = 0; if (!check_cgmagic(c, cgbp)) rebuildcg = 1; if (!rebuildcg && sblock.fs_magic == FS_UFS2_MAGIC) { inosused = cgp->cg_initediblk; if (inosused > sblock.fs_ipg) { pfatal( "Too many initialized inodes (%ju > %d) in cylinder group %d\nReset to %d\n", (uintmax_t)inosused, sblock.fs_ipg, c, sblock.fs_ipg); inosused = sblock.fs_ipg; } } else { inosused = sblock.fs_ipg; } if (got_siginfo) { printf("%s: phase 1: cyl group %d of %d (%d%%)\n", cdevname, c, sblock.fs_ncg, c * 100 / sblock.fs_ncg); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s p1 %d%%", cdevname, c * 100 / sblock.fs_ncg); got_sigalarm = 0; } /* * If we are using soft updates, then we can trust the * cylinder group inode allocation maps to tell us which * inodes are allocated. We will scan the used inode map * to find the inodes that are really in use, and then * read only those inodes in from disk. */ if ((preen || inoopt) && usedsoftdep && !rebuildcg) { cp = &cg_inosused(cgp)[(inosused - 1) / CHAR_BIT]; for ( ; inosused > 0; inosused -= CHAR_BIT, cp--) { if (*cp == 0) continue; for (i = 1 << (CHAR_BIT - 1); i > 0; i >>= 1) { if (*cp & i) break; inosused--; } break; } if (inosused < 0) inosused = 0; } /* * Allocate inoinfo structures for the allocated inodes. */ inostathead[c].il_numalloced = inosused; if (inosused == 0) { inostathead[c].il_stat = NULL; continue; } info = Calloc((unsigned)inosused, sizeof(struct inostat)); if (info == NULL) errx(EEXIT, "cannot alloc %u bytes for inoinfo", (unsigned)(sizeof(struct inostat) * inosused)); inostathead[c].il_stat = info; /* * Scan the allocated inodes. 
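 *
 * (Editor's aside, not part of this change: the soft-updates branch
 * above walks cg_inosused() from the top down and stops at the first
 * set bit, so only inodes the map says are allocated get read from
 * disk.  A standalone sketch of that scan, hypothetical name only:)
 */
static long
highest_inode_used(u_char *map, long inosused)
{
	u_char *cp;
	int i;

	for (cp = &map[(inosused - 1) / CHAR_BIT];
	    inosused > 0; inosused -= CHAR_BIT, cp--) {
		if (*cp == 0)			/* whole byte unused */
			continue;
		for (i = 1 << (CHAR_BIT - 1); i > 0; i >>= 1) {
			if (*cp & i)		/* highest set bit found */
				break;
			inosused--;
		}
		break;
	}
	return (inosused < 0 ? 0 : inosused);
}
/*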
*/ for (i = 0; i < inosused; i++, inumber++) { - if (inumber < ROOTINO) { + if (inumber < UFS_ROOTINO) { (void)getnextinode(inumber, rebuildcg); continue; } /* * NULL return indicates probable end of allocated * inodes during cylinder group rebuild attempt. * We always keep trying until we get to the minimum * valid number for this cylinder group. */ if (checkinode(inumber, &idesc, rebuildcg) == 0 && i > cgp->cg_initediblk) break; } /* * This optimization speeds up future runs of fsck * by trimming down the number of inodes in cylinder * groups that formerly had many inodes but now have * fewer in use. */ mininos = roundup(inosused + INOPB(&sblock), INOPB(&sblock)); if (inoopt && !preen && !rebuildcg && sblock.fs_magic == FS_UFS2_MAGIC && cgp->cg_initediblk > 2 * INOPB(&sblock) && mininos < cgp->cg_initediblk) { i = cgp->cg_initediblk; if (mininos < 2 * INOPB(&sblock)) cgp->cg_initediblk = 2 * INOPB(&sblock); else cgp->cg_initediblk = mininos; pwarn("CYLINDER GROUP %d: RESET FROM %ju TO %d %s\n", c, i, cgp->cg_initediblk, "VALID INODES"); dirty(cgbp); } if (inosused < sblock.fs_ipg) continue; lastino += 1; if (lastino < (c * sblock.fs_ipg)) inosused = 0; else inosused = lastino - (c * sblock.fs_ipg); if (rebuildcg && inosused > cgp->cg_initediblk && sblock.fs_magic == FS_UFS2_MAGIC) { cgp->cg_initediblk = roundup(inosused, INOPB(&sblock)); pwarn("CYLINDER GROUP %d: FOUND %d VALID INODES\n", c, cgp->cg_initediblk); } /* * If we were not able to determine in advance which inodes * were in use, then reduce the size of the inoinfo structure * to the size necessary to describe the inodes that we * really found. */ if (inumber == lastino) continue; inostathead[c].il_numalloced = inosused; if (inosused == 0) { free(inostathead[c].il_stat); inostathead[c].il_stat = NULL; continue; } info = Calloc((unsigned)inosused, sizeof(struct inostat)); if (info == NULL) errx(EEXIT, "cannot alloc %u bytes for inoinfo", (unsigned)(sizeof(struct inostat) * inosused)); memmove(info, inostathead[c].il_stat, inosused * sizeof(*info)); free(inostathead[c].il_stat); inostathead[c].il_stat = info; } freeinodebuf(); } static int checkinode(ino_t inumber, struct inodesc *idesc, int rebuildcg) { union dinode *dp; off_t kernmaxfilesize; ufs2_daddr_t ndb; mode_t mode; int j, ret, offset; if ((dp = getnextinode(inumber, rebuildcg)) == NULL) return (0); mode = DIP(dp, di_mode) & IFMT; if (mode == 0) { if ((sblock.fs_magic == FS_UFS1_MAGIC && (memcmp(dp->dp1.di_db, ufs1_zino.di_db, - NDADDR * sizeof(ufs1_daddr_t)) || + UFS_NDADDR * sizeof(ufs1_daddr_t)) || memcmp(dp->dp1.di_ib, ufs1_zino.di_ib, - NIADDR * sizeof(ufs1_daddr_t)) || + UFS_NIADDR * sizeof(ufs1_daddr_t)) || dp->dp1.di_mode || dp->dp1.di_size)) || (sblock.fs_magic == FS_UFS2_MAGIC && (memcmp(dp->dp2.di_db, ufs2_zino.di_db, - NDADDR * sizeof(ufs2_daddr_t)) || + UFS_NDADDR * sizeof(ufs2_daddr_t)) || memcmp(dp->dp2.di_ib, ufs2_zino.di_ib, - NIADDR * sizeof(ufs2_daddr_t)) || + UFS_NIADDR * sizeof(ufs2_daddr_t)) || dp->dp2.di_mode || dp->dp2.di_size))) { pfatal("PARTIALLY ALLOCATED INODE I=%lu", (u_long)inumber); if (reply("CLEAR") == 1) { dp = ginode(inumber); clearinode(dp); inodirty(); } } inoinfo(inumber)->ino_state = USTATE; return (1); } lastino = inumber; /* This should match the file size limit in ffs_mountfs(). 
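 *
 * (Editor's aside, not part of this change: the direct/indirect slot
 * checks further below, shared with getnextinode(), amount to this
 * hypothetical helper: a file of ndb blocks may use only its first
 * min(ndb, UFS_NDADDR) direct slots, plus one indirect pointer per
 * NINDIR() level actually needed; anything else is garbage.)
 */
static int
blocks_past_eof(union dinode *dp, ufs2_daddr_t ndb)
{
	int j;

	for (j = ndb; ndb < UFS_NDADDR && j < UFS_NDADDR; j++)
		if (DIP(dp, di_db[j]) != 0)	/* unused direct slot */
			return (1);
	for (j = 0, ndb -= UFS_NDADDR; ndb > 0; j++)
		ndb /= NINDIR(&sblock);		/* levels really needed */
	for (; j < UFS_NIADDR; j++)
		if (DIP(dp, di_ib[j]) != 0)	/* unused indirect slot */
			return (1);
	return (0);
}
/*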
*/ if (sblock.fs_magic == FS_UFS1_MAGIC) kernmaxfilesize = (off_t)0x40000000 * sblock.fs_bsize - 1; else kernmaxfilesize = sblock.fs_maxfilesize; if (DIP(dp, di_size) > kernmaxfilesize || DIP(dp, di_size) > sblock.fs_maxfilesize || (mode == IFDIR && DIP(dp, di_size) > MAXDIRSIZE)) { if (debug) printf("bad size %ju:", (uintmax_t)DIP(dp, di_size)); goto unknown; } if (!preen && mode == IFMT && reply("HOLD BAD BLOCK") == 1) { dp = ginode(inumber); DIP_SET(dp, di_size, sblock.fs_fsize); DIP_SET(dp, di_mode, IFREG|0600); inodirty(); } if ((mode == IFBLK || mode == IFCHR || mode == IFIFO || mode == IFSOCK) && DIP(dp, di_size) != 0) { if (debug) printf("bad special-file size %ju:", (uintmax_t)DIP(dp, di_size)); goto unknown; } if ((mode == IFBLK || mode == IFCHR) && (dev_t)DIP(dp, di_rdev) == NODEV) { if (debug) printf("bad special-file rdev NODEV:"); goto unknown; } ndb = howmany(DIP(dp, di_size), sblock.fs_bsize); if (ndb < 0) { if (debug) printf("bad size %ju ndb %ju:", (uintmax_t)DIP(dp, di_size), (uintmax_t)ndb); goto unknown; } if (mode == IFBLK || mode == IFCHR) ndb++; if (mode == IFLNK) { /* * Fake ndb value so direct/indirect block checks below * will detect any garbage after symlink string. */ if (DIP(dp, di_size) < (off_t)sblock.fs_maxsymlinklen) { if (sblock.fs_magic == FS_UFS1_MAGIC) ndb = howmany(DIP(dp, di_size), sizeof(ufs1_daddr_t)); else ndb = howmany(DIP(dp, di_size), sizeof(ufs2_daddr_t)); - if (ndb > NDADDR) { - j = ndb - NDADDR; + if (ndb > UFS_NDADDR) { + j = ndb - UFS_NDADDR; for (ndb = 1; j > 1; j--) ndb *= NINDIR(&sblock); - ndb += NDADDR; + ndb += UFS_NDADDR; } } } - for (j = ndb; ndb < NDADDR && j < NDADDR; j++) + for (j = ndb; ndb < UFS_NDADDR && j < UFS_NDADDR; j++) if (DIP(dp, di_db[j]) != 0) { if (debug) printf("bad direct addr[%d]: %ju\n", j, (uintmax_t)DIP(dp, di_db[j])); goto unknown; } - for (j = 0, ndb -= NDADDR; ndb > 0; j++) + for (j = 0, ndb -= UFS_NDADDR; ndb > 0; j++) ndb /= NINDIR(&sblock); - for (; j < NIADDR; j++) + for (; j < UFS_NIADDR; j++) if (DIP(dp, di_ib[j]) != 0) { if (debug) printf("bad indirect addr: %ju\n", (uintmax_t)DIP(dp, di_ib[j])); goto unknown; } if (ftypeok(dp) == 0) goto unknown; n_files++; inoinfo(inumber)->ino_linkcnt = DIP(dp, di_nlink); if (mode == IFDIR) { if (DIP(dp, di_size) == 0) inoinfo(inumber)->ino_state = DCLEAR; else if (DIP(dp, di_nlink) <= 0) inoinfo(inumber)->ino_state = DZLINK; else inoinfo(inumber)->ino_state = DSTATE; cacheino(dp, inumber); countdirs++; } else if (DIP(dp, di_nlink) <= 0) inoinfo(inumber)->ino_state = FZLINK; else inoinfo(inumber)->ino_state = FSTATE; inoinfo(inumber)->ino_type = IFTODT(mode); badblk = dupblk = 0; idesc->id_number = inumber; if (DIP(dp, di_flags) & SF_SNAPSHOT) idesc->id_type = SNAP; else idesc->id_type = ADDR; (void)ckinode(dp, idesc); if (sblock.fs_magic == FS_UFS2_MAGIC && dp->dp2.di_extsize > 0) { idesc->id_type = ADDR; ndb = howmany(dp->dp2.di_extsize, sblock.fs_bsize); - for (j = 0; j < NXADDR; j++) { + for (j = 0; j < UFS_NXADDR; j++) { if (--ndb == 0 && (offset = blkoff(&sblock, dp->dp2.di_extsize)) != 0) idesc->id_numfrags = numfrags(&sblock, fragroundup(&sblock, offset)); else idesc->id_numfrags = sblock.fs_frag; if (dp->dp2.di_extb[j] == 0) continue; idesc->id_blkno = dp->dp2.di_extb[j]; ret = (*idesc->id_func)(idesc); if (ret & STOP) break; } } if (sblock.fs_magic == FS_UFS2_MAGIC) eascan(idesc, &dp->dp2); idesc->id_entryno *= btodb(sblock.fs_fsize); if (DIP(dp, di_blocks) != idesc->id_entryno) { pwarn("INCORRECT BLOCK COUNT I=%lu (%ju should be %ju)", (u_long)inumber, 
(uintmax_t)DIP(dp, di_blocks), (uintmax_t)idesc->id_entryno); if (preen) printf(" (CORRECTED)\n"); else if (reply("CORRECT") == 0) return (1); if (bkgrdflag == 0) { dp = ginode(inumber); DIP_SET(dp, di_blocks, idesc->id_entryno); inodirty(); } else { cmd.value = idesc->id_number; cmd.size = idesc->id_entryno - DIP(dp, di_blocks); if (debug) printf("adjblkcnt ino %ju amount %lld\n", (uintmax_t)cmd.value, (long long)cmd.size); if (sysctl(adjblkcnt, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST INODE BLOCK COUNT", cmd.value); } } return (1); unknown: pfatal("UNKNOWN FILE TYPE I=%lu", (u_long)inumber); inoinfo(inumber)->ino_state = FCLEAR; if (reply("CLEAR") == 1) { inoinfo(inumber)->ino_state = USTATE; dp = ginode(inumber); clearinode(dp); inodirty(); } return (1); } int pass1check(struct inodesc *idesc) { int res = KEEPON; int anyout, nfrags; ufs2_daddr_t blkno = idesc->id_blkno; struct dups *dlp; struct dups *new; if (idesc->id_type == SNAP) { if (blkno == BLK_NOCOPY) return (KEEPON); if (idesc->id_number == cursnapshot) { if (blkno == blkstofrags(&sblock, idesc->id_lbn)) return (KEEPON); if (blkno == BLK_SNAP) { blkno = blkstofrags(&sblock, idesc->id_lbn); idesc->id_entryno -= idesc->id_numfrags; } } else { if (blkno == BLK_SNAP) return (KEEPON); } } if ((anyout = chkrange(blkno, idesc->id_numfrags)) != 0) { blkerror(idesc->id_number, "BAD", blkno); if (badblk++ >= MAXBAD) { pwarn("EXCESSIVE BAD BLKS I=%lu", (u_long)idesc->id_number); if (preen) printf(" (SKIPPING)\n"); else if (reply("CONTINUE") == 0) { ckfini(0); exit(EEXIT); } rerun = 1; return (STOP); } } for (nfrags = idesc->id_numfrags; nfrags > 0; blkno++, nfrags--) { if (anyout && chkrange(blkno, 1)) { res = SKIP; } else if (!testbmap(blkno)) { n_blks++; setbmap(blkno); } else { blkerror(idesc->id_number, "DUP", blkno); if (dupblk++ >= MAXDUP) { pwarn("EXCESSIVE DUP BLKS I=%lu", (u_long)idesc->id_number); if (preen) printf(" (SKIPPING)\n"); else if (reply("CONTINUE") == 0) { ckfini(0); exit(EEXIT); } rerun = 1; return (STOP); } new = (struct dups *)Malloc(sizeof(struct dups)); if (new == NULL) { pfatal("DUP TABLE OVERFLOW."); if (reply("CONTINUE") == 0) { ckfini(0); exit(EEXIT); } rerun = 1; return (STOP); } new->dup = blkno; if (muldup == NULL) { duplist = muldup = new; new->next = NULL; } else { new->next = muldup->next; muldup->next = new; } for (dlp = duplist; dlp != muldup; dlp = dlp->next) if (dlp->dup == blkno) break; if (dlp == muldup && dlp->dup != blkno) muldup = new; } /* * count the number of blocks found in id_entryno */ idesc->id_entryno++; } return (res); } Index: head/sbin/fsck_ffs/pass1b.c =================================================================== --- head/sbin/fsck_ffs/pass1b.c (revision 313779) +++ head/sbin/fsck_ffs/pass1b.c (revision 313780) @@ -1,117 +1,117 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)pass1b.c 8.4 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "fsck.h" static struct dups *duphead; static int pass1bcheck(struct inodesc *); void pass1b(void) { int c, i; union dinode *dp; struct inodesc idesc; ino_t inumber; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = ADDR; idesc.id_func = pass1bcheck; duphead = duplist; inumber = 0; for (c = 0; c < sblock.fs_ncg; c++) { if (got_siginfo) { printf("%s: phase 1b: cyl group %d of %d (%d%%)\n", cdevname, c, sblock.fs_ncg, c * 100 / sblock.fs_ncg); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s p1b %d%%", cdevname, c * 100 / sblock.fs_ncg); got_sigalarm = 0; } for (i = 0; i < sblock.fs_ipg; i++, inumber++) { - if (inumber < ROOTINO) + if (inumber < UFS_ROOTINO) continue; dp = ginode(inumber); if (dp == NULL) continue; idesc.id_number = inumber; if (inoinfo(inumber)->ino_state != USTATE && (ckinode(dp, &idesc) & STOP)) { rerun = 1; return; } } } } static int pass1bcheck(struct inodesc *idesc) { struct dups *dlp; int nfrags, res = KEEPON; ufs2_daddr_t blkno = idesc->id_blkno; for (nfrags = idesc->id_numfrags; nfrags > 0; blkno++, nfrags--) { if (chkrange(blkno, 1)) res = SKIP; for (dlp = duphead; dlp; dlp = dlp->next) { if (dlp->dup == blkno) { blkerror(idesc->id_number, "DUP", blkno); dlp->dup = duphead->dup; duphead->dup = blkno; duphead = duphead->next; } if (dlp == muldup) break; } if (muldup == NULL || duphead == muldup->next) { rerun = 1; return (STOP); } } return (res); } Index: head/sbin/fsck_ffs/pass2.c =================================================================== --- head/sbin/fsck_ffs/pass2.c (revision 313779) +++ head/sbin/fsck_ffs/pass2.c (revision 313780) @@ -1,666 +1,669 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)pass2.c 8.9 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "fsck.h" #define MINDIRSIZE (sizeof (struct dirtemplate)) static int fix_extraneous(struct inoinfo *, struct inodesc *); static int deleteentry(struct inodesc *); static int blksort(const void *, const void *); static int pass2check(struct inodesc *); void pass2(void) { union dinode *dp; struct inoinfo **inpp, *inp; struct inoinfo **inpend; struct inodesc curino; union dinode dino; int i; char pathbuf[MAXPATHLEN + 1]; - switch (inoinfo(ROOTINO)->ino_state) { + switch (inoinfo(UFS_ROOTINO)->ino_state) { case USTATE: pfatal("ROOT INODE UNALLOCATED"); if (reply("ALLOCATE") == 0) { ckfini(0); exit(EEXIT); } - if (allocdir(ROOTINO, ROOTINO, 0755) != ROOTINO) + if (allocdir(UFS_ROOTINO, UFS_ROOTINO, 0755) != UFS_ROOTINO) errx(EEXIT, "CANNOT ALLOCATE ROOT INODE"); break; case DCLEAR: pfatal("DUPS/BAD IN ROOT INODE"); if (reply("REALLOCATE")) { - freeino(ROOTINO); - if (allocdir(ROOTINO, ROOTINO, 0755) != ROOTINO) + freeino(UFS_ROOTINO); + if (allocdir(UFS_ROOTINO, UFS_ROOTINO, 0755) != + UFS_ROOTINO) errx(EEXIT, "CANNOT ALLOCATE ROOT INODE"); break; } if (reply("CONTINUE") == 0) { ckfini(0); exit(EEXIT); } break; case FSTATE: case FCLEAR: case FZLINK: pfatal("ROOT INODE NOT DIRECTORY"); if (reply("REALLOCATE")) { - freeino(ROOTINO); - if (allocdir(ROOTINO, ROOTINO, 0755) != ROOTINO) + freeino(UFS_ROOTINO); + if (allocdir(UFS_ROOTINO, UFS_ROOTINO, 0755) != + UFS_ROOTINO) errx(EEXIT, "CANNOT ALLOCATE ROOT INODE"); break; } if (reply("FIX") == 0) { ckfini(0); exit(EEXIT); } - dp = ginode(ROOTINO); + dp = ginode(UFS_ROOTINO); DIP_SET(dp, di_mode, DIP(dp, di_mode) & ~IFMT); DIP_SET(dp, di_mode, DIP(dp, di_mode) | IFDIR); inodirty(); break; case DSTATE: case DZLINK: break; default: errx(EEXIT, "BAD STATE %d FOR ROOT INODE", - inoinfo(ROOTINO)->ino_state); + inoinfo(UFS_ROOTINO)->ino_state); } - inoinfo(ROOTINO)->ino_state = DFOUND; - inoinfo(WINO)->ino_state = FSTATE; - inoinfo(WINO)->ino_type = DT_WHT; + inoinfo(UFS_ROOTINO)->ino_state = DFOUND; + inoinfo(UFS_WINO)->ino_state = FSTATE; + inoinfo(UFS_WINO)->ino_type = DT_WHT; /* * Sort the directory list into disk block order. */ qsort((char *)inpsort, (size_t)inplast, sizeof *inpsort, blksort); /* * Check the integrity of each directory. 
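 *
 * (Editor's aside, not part of this change: the qsort(3) just above
 * orders directories by the disk address of their first data block
 * so this scan reads them near-sequentially.  Its comparator is
 * blksort() at the bottom of this file; an equivalent sketch:)
 */
static int
byfirstblk(const void *a, const void *b)
{
	return ((*(struct inoinfo * const *)a)->i_blks[0] -
	    (*(struct inoinfo * const *)b)->i_blks[0]);
}
/*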
*/ memset(&curino, 0, sizeof(struct inodesc)); curino.id_type = DATA; curino.id_func = pass2check; inpend = &inpsort[inplast]; for (inpp = inpsort; inpp < inpend; inpp++) { if (got_siginfo) { printf("%s: phase 2: dir %td of %d (%d%%)\n", cdevname, inpp - inpsort, (int)inplast, (int)((inpp - inpsort) * 100 / inplast)); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s p2 %d%%", cdevname, (int)((inpp - inpsort) * 100 / inplast)); got_sigalarm = 0; } inp = *inpp; if (inp->i_isize == 0) continue; if (inp->i_isize < MINDIRSIZE) { direrror(inp->i_number, "DIRECTORY TOO SHORT"); inp->i_isize = roundup(MINDIRSIZE, DIRBLKSIZ); if (reply("FIX") == 1) { dp = ginode(inp->i_number); DIP_SET(dp, di_size, inp->i_isize); inodirty(); } } else if ((inp->i_isize & (DIRBLKSIZ - 1)) != 0) { getpathname(pathbuf, inp->i_number, inp->i_number); if (usedsoftdep) pfatal("%s %s: LENGTH %jd NOT MULTIPLE OF %d", "DIRECTORY", pathbuf, (intmax_t)inp->i_isize, DIRBLKSIZ); else pwarn("%s %s: LENGTH %jd NOT MULTIPLE OF %d", "DIRECTORY", pathbuf, (intmax_t)inp->i_isize, DIRBLKSIZ); if (preen) printf(" (ADJUSTED)\n"); inp->i_isize = roundup(inp->i_isize, DIRBLKSIZ); if (preen || reply("ADJUST") == 1) { dp = ginode(inp->i_number); DIP_SET(dp, di_size, roundup(inp->i_isize, DIRBLKSIZ)); inodirty(); } } dp = &dino; memset(dp, 0, sizeof(struct ufs2_dinode)); DIP_SET(dp, di_mode, IFDIR); DIP_SET(dp, di_size, inp->i_isize); - for (i = 0; i < MIN(inp->i_numblks, NDADDR); i++) + for (i = 0; i < MIN(inp->i_numblks, UFS_NDADDR); i++) DIP_SET(dp, di_db[i], inp->i_blks[i]); - if (inp->i_numblks > NDADDR) - for (i = 0; i < NIADDR; i++) - DIP_SET(dp, di_ib[i], inp->i_blks[NDADDR + i]); + if (inp->i_numblks > UFS_NDADDR) + for (i = 0; i < UFS_NIADDR; i++) + DIP_SET(dp, di_ib[i], + inp->i_blks[UFS_NDADDR + i]); curino.id_number = inp->i_number; curino.id_parent = inp->i_parent; (void)ckinode(dp, &curino); } /* * Now that the parents of all directories have been found, * make another pass to verify the value of `..' */ for (inpp = inpsort; inpp < inpend; inpp++) { inp = *inpp; if (inp->i_parent == 0 || inp->i_isize == 0) continue; if (inoinfo(inp->i_parent)->ino_state == DFOUND && INO_IS_DUNFOUND(inp->i_number)) inoinfo(inp->i_number)->ino_state = DFOUND; if (inp->i_dotdot == inp->i_parent || inp->i_dotdot == (ino_t)-1) continue; if (inp->i_dotdot == 0) { inp->i_dotdot = inp->i_parent; fileerror(inp->i_parent, inp->i_number, "MISSING '..'"); if (reply("FIX") == 0) continue; (void)makeentry(inp->i_number, inp->i_parent, ".."); inoinfo(inp->i_parent)->ino_linkcnt--; continue; } /* * Here we have: * inp->i_number is directory with bad ".." in it. * inp->i_dotdot is current value of "..". * inp->i_parent is directory to which ".." should point. */ getpathname(pathbuf, inp->i_parent, inp->i_number); printf("BAD INODE NUMBER FOR '..' 
in DIR I=%ju (%s)\n", (uintmax_t)inp->i_number, pathbuf); getpathname(pathbuf, inp->i_dotdot, inp->i_dotdot); printf("CURRENTLY POINTS TO I=%ju (%s), ", (uintmax_t)inp->i_dotdot, pathbuf); getpathname(pathbuf, inp->i_parent, inp->i_parent); printf("SHOULD POINT TO I=%ju (%s)", (uintmax_t)inp->i_parent, pathbuf); if (cursnapshot != 0) { /* * We need to: * setcwd(inp->i_number); * setdotdot(inp->i_dotdot, inp->i_parent); */ cmd.value = inp->i_number; if (sysctlbyname("vfs.ffs.setcwd", 0, 0, &cmd, sizeof cmd) == -1) { /* kernel lacks support for these functions */ printf(" (IGNORED)\n"); continue; } cmd.value = inp->i_dotdot; /* verify same value */ cmd.size = inp->i_parent; /* new parent */ if (sysctlbyname("vfs.ffs.setdotdot", 0, 0, &cmd, sizeof cmd) == -1) { printf(" (FIX FAILED: %s)\n", strerror(errno)); continue; } printf(" (FIXED)\n"); inoinfo(inp->i_parent)->ino_linkcnt--; inp->i_dotdot = inp->i_parent; continue; } if (preen) printf(" (FIXED)\n"); else if (reply("FIX") == 0) continue; inoinfo(inp->i_dotdot)->ino_linkcnt++; inoinfo(inp->i_parent)->ino_linkcnt--; inp->i_dotdot = inp->i_parent; (void)changeino(inp->i_number, "..", inp->i_parent); } /* * Mark all the directories that can be found from the root. */ propagate(); } static int pass2check(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; char dirname[MAXPATHLEN + 1]; struct inoinfo *inp; int n, entrysize, ret = 0; union dinode *dp; const char *errmsg; struct direct proto; /* * check for "." */ if (dirp->d_ino > maxino) goto chk2; if (idesc->id_entryno != 0) goto chk1; if (dirp->d_ino != 0 && strcmp(dirp->d_name, ".") == 0) { if (dirp->d_ino != idesc->id_number) { direrror(idesc->id_number, "BAD INODE NUMBER FOR '.'"); dirp->d_ino = idesc->id_number; if (reply("FIX") == 1) ret |= ALTERED; } if (dirp->d_type != DT_DIR) { direrror(idesc->id_number, "BAD TYPE VALUE FOR '.'"); dirp->d_type = DT_DIR; if (reply("FIX") == 1) ret |= ALTERED; } goto chk1; } direrror(idesc->id_number, "MISSING '.'"); proto.d_ino = idesc->id_number; proto.d_type = DT_DIR; proto.d_namlen = 1; (void)strcpy(proto.d_name, "."); entrysize = DIRSIZ(0, &proto); if (dirp->d_ino != 0 && strcmp(dirp->d_name, "..") != 0) { pfatal("CANNOT FIX, FIRST ENTRY IN DIRECTORY CONTAINS %s\n", dirp->d_name); } else if (dirp->d_reclen < entrysize) { pfatal("CANNOT FIX, INSUFFICIENT SPACE TO ADD '.'\n"); } else if (dirp->d_reclen < 2 * entrysize) { proto.d_reclen = dirp->d_reclen; memmove(dirp, &proto, (size_t)entrysize); if (reply("FIX") == 1) ret |= ALTERED; } else { n = dirp->d_reclen - entrysize; proto.d_reclen = entrysize; memmove(dirp, &proto, (size_t)entrysize); idesc->id_entryno++; inoinfo(dirp->d_ino)->ino_linkcnt--; dirp = (struct direct *)((char *)(dirp) + entrysize); memset(dirp, 0, (size_t)n); dirp->d_reclen = n; if (reply("FIX") == 1) ret |= ALTERED; } chk1: if (idesc->id_entryno > 1) goto chk2; inp = getinoinfo(idesc->id_number); proto.d_ino = inp->i_parent; proto.d_type = DT_DIR; proto.d_namlen = 2; (void)strcpy(proto.d_name, ".."); entrysize = DIRSIZ(0, &proto); if (idesc->id_entryno == 0) { n = DIRSIZ(0, dirp); if (dirp->d_reclen < n + entrysize) goto chk2; proto.d_reclen = dirp->d_reclen - n; dirp->d_reclen = n; idesc->id_entryno++; inoinfo(dirp->d_ino)->ino_linkcnt--; dirp = (struct direct *)((char *)(dirp) + n); memset(dirp, 0, (size_t)proto.d_reclen); dirp->d_reclen = proto.d_reclen; } if (dirp->d_ino != 0 && strcmp(dirp->d_name, "..") == 0) { inp->i_dotdot = dirp->d_ino; if (dirp->d_type != DT_DIR) { direrror(idesc->id_number, "BAD TYPE VALUE 
FOR '..'"); dirp->d_type = DT_DIR; if (reply("FIX") == 1) ret |= ALTERED; } goto chk2; } if (dirp->d_ino != 0 && strcmp(dirp->d_name, ".") != 0) { fileerror(inp->i_parent, idesc->id_number, "MISSING '..'"); pfatal("CANNOT FIX, SECOND ENTRY IN DIRECTORY CONTAINS %s\n", dirp->d_name); inp->i_dotdot = (ino_t)-1; } else if (dirp->d_reclen < entrysize) { fileerror(inp->i_parent, idesc->id_number, "MISSING '..'"); pfatal("CANNOT FIX, INSUFFICIENT SPACE TO ADD '..'\n"); inp->i_dotdot = (ino_t)-1; } else if (inp->i_parent != 0) { /* * We know the parent, so fix now. */ inp->i_dotdot = inp->i_parent; fileerror(inp->i_parent, idesc->id_number, "MISSING '..'"); proto.d_reclen = dirp->d_reclen; memmove(dirp, &proto, (size_t)entrysize); if (reply("FIX") == 1) ret |= ALTERED; } idesc->id_entryno++; if (dirp->d_ino != 0) inoinfo(dirp->d_ino)->ino_linkcnt--; return (ret|KEEPON); chk2: if (dirp->d_ino == 0) return (ret|KEEPON); if (dirp->d_namlen <= 2 && dirp->d_name[0] == '.' && idesc->id_entryno >= 2) { if (dirp->d_namlen == 1) { direrror(idesc->id_number, "EXTRA '.' ENTRY"); dirp->d_ino = 0; if (reply("FIX") == 1) ret |= ALTERED; return (KEEPON | ret); } if (dirp->d_name[1] == '.') { direrror(idesc->id_number, "EXTRA '..' ENTRY"); dirp->d_ino = 0; if (reply("FIX") == 1) ret |= ALTERED; return (KEEPON | ret); } } idesc->id_entryno++; n = 0; if (dirp->d_ino > maxino) { fileerror(idesc->id_number, dirp->d_ino, "I OUT OF RANGE"); n = reply("REMOVE"); - } else if (((dirp->d_ino == WINO && dirp->d_type != DT_WHT) || - (dirp->d_ino != WINO && dirp->d_type == DT_WHT))) { + } else if (((dirp->d_ino == UFS_WINO && dirp->d_type != DT_WHT) || + (dirp->d_ino != UFS_WINO && dirp->d_type == DT_WHT))) { fileerror(idesc->id_number, dirp->d_ino, "BAD WHITEOUT ENTRY"); - dirp->d_ino = WINO; + dirp->d_ino = UFS_WINO; dirp->d_type = DT_WHT; if (reply("FIX") == 1) ret |= ALTERED; } else { again: switch (inoinfo(dirp->d_ino)->ino_state) { case USTATE: if (idesc->id_entryno <= 2) break; fileerror(idesc->id_number, dirp->d_ino, "UNALLOCATED"); n = reply("REMOVE"); break; case DCLEAR: case FCLEAR: if (idesc->id_entryno <= 2) break; if (inoinfo(dirp->d_ino)->ino_state == FCLEAR) errmsg = "DUP/BAD"; else if (!preen && !usedsoftdep) errmsg = "ZERO LENGTH DIRECTORY"; else if (cursnapshot == 0) { n = 1; break; } else { getpathname(dirname, idesc->id_number, dirp->d_ino); pwarn("ZERO LENGTH DIRECTORY %s I=%ju", dirname, (uintmax_t)dirp->d_ino); /* * We need to: * setcwd(idesc->id_parent); * rmdir(dirp->d_name); */ cmd.value = idesc->id_number; if (sysctlbyname("vfs.ffs.setcwd", 0, 0, &cmd, sizeof cmd) == -1) { /* kernel lacks support */ printf(" (IGNORED)\n"); n = 1; break; } if (rmdir(dirp->d_name) == -1) { printf(" (REMOVAL FAILED: %s)\n", strerror(errno)); n = 1; break; } /* ".." reference to parent is removed */ inoinfo(idesc->id_number)->ino_linkcnt--; printf(" (REMOVED)\n"); break; } fileerror(idesc->id_number, dirp->d_ino, errmsg); if ((n = reply("REMOVE")) == 1) break; dp = ginode(dirp->d_ino); inoinfo(dirp->d_ino)->ino_state = (DIP(dp, di_mode) & IFMT) == IFDIR ? 
DSTATE : FSTATE; inoinfo(dirp->d_ino)->ino_linkcnt = DIP(dp, di_nlink); goto again; case DSTATE: case DZLINK: if (inoinfo(idesc->id_number)->ino_state == DFOUND) inoinfo(dirp->d_ino)->ino_state = DFOUND; /* FALLTHROUGH */ case DFOUND: inp = getinoinfo(dirp->d_ino); if (idesc->id_entryno > 2) { if (inp->i_parent == 0) inp->i_parent = idesc->id_number; else if ((n = fix_extraneous(inp, idesc)) == 1) break; } /* FALLTHROUGH */ case FSTATE: case FZLINK: if (dirp->d_type != inoinfo(dirp->d_ino)->ino_type) { fileerror(idesc->id_number, dirp->d_ino, "BAD TYPE VALUE"); dirp->d_type = inoinfo(dirp->d_ino)->ino_type; if (reply("FIX") == 1) ret |= ALTERED; } inoinfo(dirp->d_ino)->ino_linkcnt--; break; default: errx(EEXIT, "BAD STATE %d FOR INODE I=%ju", inoinfo(dirp->d_ino)->ino_state, (uintmax_t)dirp->d_ino); } } if (n == 0) return (ret|KEEPON); dirp->d_ino = 0; return (ret|KEEPON|ALTERED); } static int fix_extraneous(struct inoinfo *inp, struct inodesc *idesc) { char *cp; struct inodesc dotdesc; char oldname[MAXPATHLEN + 1]; char newname[MAXPATHLEN + 1]; /* * If we have not yet found "..", look it up now so we know * which inode the directory itself believes is its parent. */ if (inp->i_dotdot == 0) { memset(&dotdesc, 0, sizeof(struct inodesc)); dotdesc.id_type = DATA; dotdesc.id_number = idesc->id_dirp->d_ino; dotdesc.id_func = findino; dotdesc.id_name = strdup(".."); if ((ckinode(ginode(dotdesc.id_number), &dotdesc) & FOUND)) inp->i_dotdot = dotdesc.id_parent; } /* * We have the previously found old name (inp->i_parent) and the * just found new name (idesc->id_number). We have five cases: * 1) ".." is missing - can remove either name, choose to delete * new one and let fsck create ".." pointing to old name. * 2) Both new and old are in same directory, choose to delete * the new name and let fsck fix ".." if it is wrong. * 3) ".." does not point to the new name, so delete it and let * fsck fix ".." to point to the old one if it is wrong. * 4) ".." points to the old name only, so delete the new one. * 5) ".." points to the new name only, so delete the old one. * * For cases 1-4 we eliminate the new name; * for case 5 we eliminate the old name. */ if (inp->i_dotdot == 0 || /* Case 1 */ idesc->id_number == inp->i_parent || /* Case 2 */ inp->i_dotdot != idesc->id_number || /* Case 3 */ inp->i_dotdot == inp->i_parent) { /* Case 4 */ getpathname(newname, idesc->id_number, idesc->id_number); if (strcmp(newname, "/") != 0) strcat (newname, "/"); strcat(newname, idesc->id_dirp->d_name); getpathname(oldname, inp->i_number, inp->i_number); pwarn("%s IS AN EXTRANEOUS HARD LINK TO DIRECTORY %s", newname, oldname); if (cursnapshot != 0) { /* * We need to * setcwd(idesc->id_number); * unlink(idesc->id_dirp->d_name); */ cmd.value = idesc->id_number; if (sysctlbyname("vfs.ffs.setcwd", 0, 0, &cmd, sizeof cmd) == -1) { printf(" (IGNORED)\n"); return (0); } cmd.value = (intptr_t)idesc->id_dirp->d_name; cmd.size = inp->i_number; /* verify same name */ if (sysctlbyname("vfs.ffs.unlink", 0, 0, &cmd, sizeof cmd) == -1) { printf(" (UNLINK FAILED: %s)\n", strerror(errno)); return (0); } printf(" (REMOVED)\n"); return (0); } if (preen) { printf(" (REMOVED)\n"); return (1); } return (reply("REMOVE")); } /* * None of the first four cases above, so must be case (5). * Eliminate the old name and make the new the name the parent. 
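 *
 * (Editor's aside, not part of this change: the four-way test above
 * reduces to this hypothetical predicate, where dotdot is the inode
 * ".." currently names, oldp the previously recorded parent, and
 * newp the directory the extra link was just found in; nonzero means
 * drop the new name, cases 1-4, zero means drop the old, case 5.)
 */
static int
drop_new_name(ino_t dotdot, ino_t oldp, ino_t newp)
{
	return (dotdot == 0 ||		/* 1: ".." missing */
	    newp == oldp ||		/* 2: both names in one dir */
	    dotdot != newp ||		/* 3: ".." points elsewhere */
	    dotdot == oldp);		/* 4: ".." names the old dir */
}
/*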
*/ getpathname(oldname, inp->i_parent, inp->i_number); getpathname(newname, inp->i_number, inp->i_number); pwarn("%s IS AN EXTRANEOUS HARD LINK TO DIRECTORY %s", oldname, newname); if (cursnapshot != 0) { /* * We need to * setcwd(inp->i_parent); * unlink(last component of oldname pathname); */ cmd.value = inp->i_parent; if (sysctlbyname("vfs.ffs.setcwd", 0, 0, &cmd, sizeof cmd) == -1) { printf(" (IGNORED)\n"); return (0); } if ((cp = strchr(oldname, '/')) == NULL) { printf(" (IGNORED)\n"); return (0); } cmd.value = (intptr_t)(cp + 1); cmd.size = inp->i_number; /* verify same name */ if (sysctlbyname("vfs.ffs.unlink", 0, 0, &cmd, sizeof cmd) == -1) { printf(" (UNLINK FAILED: %s)\n", strerror(errno)); return (0); } printf(" (REMOVED)\n"); inp->i_parent = idesc->id_number; /* reparent to correct dir */ return (0); } if (!preen && !reply("REMOVE")) return (0); memset(&dotdesc, 0, sizeof(struct inodesc)); dotdesc.id_type = DATA; dotdesc.id_number = inp->i_parent; /* directory in which name appears */ dotdesc.id_parent = inp->i_number; /* inode number in entry to delete */ dotdesc.id_func = deleteentry; if ((ckinode(ginode(dotdesc.id_number), &dotdesc) & FOUND) && preen) printf(" (REMOVED)\n"); inp->i_parent = idesc->id_number; /* reparent to correct directory */ inoinfo(inp->i_number)->ino_linkcnt++; /* name gone, return reference */ return (0); } static int deleteentry(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (idesc->id_entryno++ < 2 || dirp->d_ino != idesc->id_parent) return (KEEPON); dirp->d_ino = 0; return (ALTERED|STOP|FOUND); } /* * Routine to sort disk blocks. */ static int blksort(const void *arg1, const void *arg2) { return ((*(struct inoinfo * const *)arg1)->i_blks[0] - (*(struct inoinfo * const *)arg2)->i_blks[0]); } Index: head/sbin/fsck_ffs/pass3.c =================================================================== --- head/sbin/fsck_ffs/pass3.c (revision 313779) +++ head/sbin/fsck_ffs/pass3.c (revision 313780) @@ -1,127 +1,127 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #if 0 #ifndef lint static const char sccsid[] = "@(#)pass3.c 8.2 (Berkeley) 4/27/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "fsck.h" void pass3(void) { struct inoinfo *inp; int loopcnt, inpindex, state; ino_t orphan; struct inodesc idesc; char namebuf[UFS_MAXNAMLEN+1]; for (inpindex = inplast - 1; inpindex >= 0; inpindex--) { if (got_siginfo) { printf("%s: phase 3: dir %d of %d (%d%%)\n", cdevname, (int)(inplast - inpindex - 1), (int)inplast, (int)((inplast - inpindex - 1) * 100 / inplast)); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s p3 %d%%", cdevname, (int)((inplast - inpindex - 1) * 100 / inplast)); got_sigalarm = 0; } inp = inpsort[inpindex]; state = inoinfo(inp->i_number)->ino_state; - if (inp->i_number == ROOTINO || + if (inp->i_number == UFS_ROOTINO || (inp->i_parent != 0 && !S_IS_DUNFOUND(state))) continue; if (state == DCLEAR) continue; /* * If we are running with soft updates and we come * across unreferenced directories, we just leave * them in DSTATE which will cause them to be pitched * in pass 4. */ if ((preen || bkgrdflag) && resolved && usedsoftdep && S_IS_DUNFOUND(state)) { - if (inp->i_dotdot >= ROOTINO) + if (inp->i_dotdot >= UFS_ROOTINO) inoinfo(inp->i_dotdot)->ino_linkcnt++; continue; } for (loopcnt = 0; ; loopcnt++) { orphan = inp->i_number; if (inp->i_parent == 0 || !INO_IS_DUNFOUND(inp->i_parent) || loopcnt > countdirs) break; inp = getinoinfo(inp->i_parent); } if (loopcnt <= countdirs) { if (linkup(orphan, inp->i_dotdot, NULL)) { inp->i_parent = inp->i_dotdot = lfdir; inoinfo(lfdir)->ino_linkcnt--; } inoinfo(orphan)->ino_state = DFOUND; propagate(); continue; } pfatal("ORPHANED DIRECTORY LOOP DETECTED I=%lu", (u_long)orphan); if (reply("RECONNECT") == 0) continue; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = DATA; idesc.id_number = inp->i_parent; idesc.id_parent = orphan; idesc.id_func = findname; idesc.id_name = namebuf; if ((ckinode(ginode(inp->i_parent), &idesc) & FOUND) == 0) pfatal("COULD NOT FIND NAME IN PARENT DIRECTORY"); if (linkup(orphan, inp->i_parent, namebuf)) { idesc.id_func = clearentry; if (ckinode(ginode(inp->i_parent), &idesc) & FOUND) inoinfo(orphan)->ino_linkcnt++; inp->i_parent = inp->i_dotdot = lfdir; inoinfo(lfdir)->ino_linkcnt--; } inoinfo(orphan)->ino_state = DFOUND; propagate(); } } Index: head/sbin/fsck_ffs/pass4.c =================================================================== --- head/sbin/fsck_ffs/pass4.c (revision 313779) +++ head/sbin/fsck_ffs/pass4.c (revision 313780) @@ -1,153 +1,153 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)pass4.c 8.4 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "fsck.h" void pass4(void) { ino_t inumber; union dinode *dp; struct inodesc idesc; int i, n, cg; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = ADDR; idesc.id_func = pass4check; for (cg = 0; cg < sblock.fs_ncg; cg++) { if (got_siginfo) { printf("%s: phase 4: cyl group %d of %d (%d%%)\n", cdevname, cg, sblock.fs_ncg, cg * 100 / sblock.fs_ncg); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s p4 %d%%", cdevname, cg * 100 / sblock.fs_ncg); got_sigalarm = 0; } inumber = cg * sblock.fs_ipg; for (i = 0; i < inostathead[cg].il_numalloced; i++, inumber++) { - if (inumber < ROOTINO) + if (inumber < UFS_ROOTINO) continue; idesc.id_number = inumber; switch (inoinfo(inumber)->ino_state) { case FZLINK: case DZLINK: if (inoinfo(inumber)->ino_linkcnt == 0) { clri(&idesc, "UNREF", 1); break; } /* fall through */ case FSTATE: case DFOUND: n = inoinfo(inumber)->ino_linkcnt; if (n) { adjust(&idesc, (short)n); break; } break; case DSTATE: clri(&idesc, "UNREF", 1); break; case DCLEAR: /* if on snapshot, already cleared */ if (cursnapshot != 0) break; dp = ginode(inumber); if (DIP(dp, di_size) == 0) { clri(&idesc, "ZERO LENGTH", 1); break; } /* fall through */ case FCLEAR: clri(&idesc, "BAD/DUP", 1); break; case USTATE: break; default: errx(EEXIT, "BAD STATE %d FOR INODE I=%ju", inoinfo(inumber)->ino_state, (uintmax_t)inumber); } } } } int pass4check(struct inodesc *idesc) { struct dups *dlp; int nfrags, res = KEEPON; ufs2_daddr_t blkno = idesc->id_blkno; for (nfrags = idesc->id_numfrags; nfrags > 0; blkno++, nfrags--) { if (chkrange(blkno, 1)) { res = SKIP; } else if (testbmap(blkno)) { for (dlp = duplist; dlp; dlp = dlp->next) { if (dlp->dup != blkno) continue; dlp->dup = duplist->dup; dlp = duplist; duplist = duplist->next; free((char *)dlp); break; } if (dlp == NULL) { clrbmap(blkno); n_blks--; } } } return (res); } Index: head/sbin/fsck_ffs/pass5.c =================================================================== --- head/sbin/fsck_ffs/pass5.c (revision 313779) +++ head/sbin/fsck_ffs/pass5.c (revision 313780) @@ -1,596 +1,596 @@ /* * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)pass5.c 8.9 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "fsck.h" static void check_maps(u_char *, u_char *, int, ufs2_daddr_t, const char *, int *, int, int, int); static void clear_blocks(ufs2_daddr_t start, ufs2_daddr_t end); void pass5(void) { int c, i, j, blk, frags, basesize, mapsize; int inomapsize, blkmapsize; struct fs *fs = &sblock; ufs2_daddr_t d, dbase, dmax, start; int rewritecg = 0; struct csum *cs; struct csum_total cstotal; struct inodesc idesc[3]; char buf[MAXBSIZE]; struct cg *cg, *newcg = (struct cg *)buf; struct bufarea *cgbp; - inoinfo(WINO)->ino_state = USTATE; + inoinfo(UFS_WINO)->ino_state = USTATE; memset(newcg, 0, (size_t)fs->fs_cgsize); newcg->cg_niblk = fs->fs_ipg; if (cvtlevel >= 3) { if (fs->fs_maxcontig < 2 && fs->fs_contigsumsize > 0) { if (preen) pwarn("DELETING CLUSTERING MAPS\n"); if (preen || reply("DELETE CLUSTERING MAPS")) { fs->fs_contigsumsize = 0; rewritecg = 1; sbdirty(); } } if (fs->fs_maxcontig > 1) { const char *doit = NULL; if (fs->fs_contigsumsize < 1) { doit = "CREAT"; } else if (fs->fs_contigsumsize < fs->fs_maxcontig && fs->fs_contigsumsize < FS_MAXCONTIG) { doit = "EXPAND"; } if (doit) { i = fs->fs_contigsumsize; fs->fs_contigsumsize = MIN(fs->fs_maxcontig, FS_MAXCONTIG); if (CGSIZE(fs) > (u_int)fs->fs_bsize) { pwarn("CANNOT %s CLUSTER MAPS\n", doit); fs->fs_contigsumsize = i; } else if (preen || reply("CREATE CLUSTER MAPS")) { if (preen) pwarn("%sING CLUSTER MAPS\n", doit); fs->fs_cgsize = fragroundup(fs, CGSIZE(fs)); rewritecg = 1; sbdirty(); } } } } basesize = &newcg->cg_space[0] - (u_char *)(&newcg->cg_firstfield); if (sblock.fs_magic == FS_UFS2_MAGIC) { newcg->cg_iusedoff = basesize; } else { /* * We reserve the space for the old rotation summary * tables for the benefit of old kernels, but do not * maintain them in modern kernels. In time, they can * go away. 
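 * (The offsets computed below lay the reserved region out in order:
 * cg_old_btotoff first, cg_old_boff after fs_old_cpg * sizeof(int32_t)
 * bytes, and cg_iusedoff after a further fs_old_cpg * fs_old_nrpos
 * u_int16_t entries, with the whole reserved area zeroed.)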
*/ newcg->cg_old_btotoff = basesize; newcg->cg_old_boff = newcg->cg_old_btotoff + fs->fs_old_cpg * sizeof(int32_t); newcg->cg_iusedoff = newcg->cg_old_boff + fs->fs_old_cpg * fs->fs_old_nrpos * sizeof(u_int16_t); memset(&newcg->cg_space[0], 0, newcg->cg_iusedoff - basesize); } inomapsize = howmany(fs->fs_ipg, CHAR_BIT); newcg->cg_freeoff = newcg->cg_iusedoff + inomapsize; blkmapsize = howmany(fs->fs_fpg, CHAR_BIT); newcg->cg_nextfreeoff = newcg->cg_freeoff + blkmapsize; if (fs->fs_contigsumsize > 0) { newcg->cg_clustersumoff = newcg->cg_nextfreeoff - sizeof(u_int32_t); newcg->cg_clustersumoff = roundup(newcg->cg_clustersumoff, sizeof(u_int32_t)); newcg->cg_clusteroff = newcg->cg_clustersumoff + (fs->fs_contigsumsize + 1) * sizeof(u_int32_t); newcg->cg_nextfreeoff = newcg->cg_clusteroff + howmany(fragstoblks(fs, fs->fs_fpg), CHAR_BIT); } newcg->cg_magic = CG_MAGIC; mapsize = newcg->cg_nextfreeoff - newcg->cg_iusedoff; memset(&idesc[0], 0, sizeof idesc); for (i = 0; i < 3; i++) idesc[i].id_type = ADDR; memset(&cstotal, 0, sizeof(struct csum_total)); dmax = blknum(fs, fs->fs_size + fs->fs_frag - 1); for (d = fs->fs_size; d < dmax; d++) setbmap(d); for (c = 0; c < fs->fs_ncg; c++) { if (got_siginfo) { printf("%s: phase 5: cyl group %d of %d (%d%%)\n", cdevname, c, sblock.fs_ncg, c * 100 / sblock.fs_ncg); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s p5 %d%%", cdevname, c * 100 / sblock.fs_ncg); got_sigalarm = 0; } cgbp = cgget(c); cg = cgbp->b_un.b_cg; if (!cg_chkmagic(cg)) pfatal("CG %d: BAD MAGIC NUMBER\n", c); newcg->cg_time = cg->cg_time; newcg->cg_old_time = cg->cg_old_time; newcg->cg_unrefs = cg->cg_unrefs; newcg->cg_cgx = c; dbase = cgbase(fs, c); dmax = dbase + fs->fs_fpg; if (dmax > fs->fs_size) dmax = fs->fs_size; newcg->cg_ndblk = dmax - dbase; if (fs->fs_magic == FS_UFS1_MAGIC) { if (c == fs->fs_ncg - 1) newcg->cg_old_ncyl = howmany(newcg->cg_ndblk, fs->fs_fpg / fs->fs_old_cpg); else newcg->cg_old_ncyl = fs->fs_old_cpg; newcg->cg_old_niblk = fs->fs_ipg; newcg->cg_niblk = 0; } if (fs->fs_contigsumsize > 0) newcg->cg_nclusterblks = newcg->cg_ndblk / fs->fs_frag; newcg->cg_cs.cs_ndir = 0; newcg->cg_cs.cs_nffree = 0; newcg->cg_cs.cs_nbfree = 0; newcg->cg_cs.cs_nifree = fs->fs_ipg; if (cg->cg_rotor >= 0 && cg->cg_rotor < newcg->cg_ndblk) newcg->cg_rotor = cg->cg_rotor; else newcg->cg_rotor = 0; if (cg->cg_frotor >= 0 && cg->cg_frotor < newcg->cg_ndblk) newcg->cg_frotor = cg->cg_frotor; else newcg->cg_frotor = 0; if (cg->cg_irotor >= 0 && cg->cg_irotor < fs->fs_ipg) newcg->cg_irotor = cg->cg_irotor; else newcg->cg_irotor = 0; if (fs->fs_magic == FS_UFS1_MAGIC) { newcg->cg_initediblk = 0; } else { if ((unsigned)cg->cg_initediblk > fs->fs_ipg) newcg->cg_initediblk = fs->fs_ipg; else newcg->cg_initediblk = cg->cg_initediblk; } memset(&newcg->cg_frsum[0], 0, sizeof newcg->cg_frsum); memset(cg_inosused(newcg), 0, (size_t)(mapsize)); j = fs->fs_ipg * c; for (i = 0; i < inostathead[c].il_numalloced; j++, i++) { switch (inoinfo(j)->ino_state) { case USTATE: break; case DSTATE: case DCLEAR: case DFOUND: case DZLINK: newcg->cg_cs.cs_ndir++; /* FALLTHROUGH */ case FSTATE: case FCLEAR: case FZLINK: newcg->cg_cs.cs_nifree--; setbit(cg_inosused(newcg), i); break; default: - if (j < (int)ROOTINO) + if (j < (int)UFS_ROOTINO) break; errx(EEXIT, "BAD STATE %d FOR INODE I=%d", inoinfo(j)->ino_state, j); } } if (c == 0) - for (i = 0; i < (int)ROOTINO; i++) { + for (i = 0; i < (int)UFS_ROOTINO; i++) { setbit(cg_inosused(newcg), i); newcg->cg_cs.cs_nifree--; } start = -1; for (i = 0, d = dbase; d < 
dmax; d += fs->fs_frag, i += fs->fs_frag) { frags = 0; for (j = 0; j < fs->fs_frag; j++) { if (testbmap(d + j)) { if ((Eflag || Zflag) && start != -1) { clear_blocks(start, d + j - 1); start = -1; } continue; } if (start == -1) start = d + j; setbit(cg_blksfree(newcg), i + j); frags++; } if (frags == fs->fs_frag) { newcg->cg_cs.cs_nbfree++; if (fs->fs_contigsumsize > 0) setbit(cg_clustersfree(newcg), i / fs->fs_frag); } else if (frags > 0) { newcg->cg_cs.cs_nffree += frags; blk = blkmap(fs, cg_blksfree(newcg), i); ffs_fragacct(fs, blk, newcg->cg_frsum, 1); } } if ((Eflag || Zflag) && start != -1) clear_blocks(start, d - 1); if (fs->fs_contigsumsize > 0) { int32_t *sump = cg_clustersum(newcg); u_char *mapp = cg_clustersfree(newcg); int map = *mapp++; int bit = 1; int run = 0; for (i = 0; i < newcg->cg_nclusterblks; i++) { if ((map & bit) != 0) { run++; } else if (run != 0) { if (run > fs->fs_contigsumsize) run = fs->fs_contigsumsize; sump[run]++; run = 0; } if ((i & (CHAR_BIT - 1)) != (CHAR_BIT - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (run != 0) { if (run > fs->fs_contigsumsize) run = fs->fs_contigsumsize; sump[run]++; } } if (bkgrdflag != 0) { cstotal.cs_nffree += cg->cg_cs.cs_nffree; cstotal.cs_nbfree += cg->cg_cs.cs_nbfree; cstotal.cs_nifree += cg->cg_cs.cs_nifree; cstotal.cs_ndir += cg->cg_cs.cs_ndir; } else { cstotal.cs_nffree += newcg->cg_cs.cs_nffree; cstotal.cs_nbfree += newcg->cg_cs.cs_nbfree; cstotal.cs_nifree += newcg->cg_cs.cs_nifree; cstotal.cs_ndir += newcg->cg_cs.cs_ndir; } cs = &fs->fs_cs(fs, c); if (cursnapshot == 0 && memcmp(&newcg->cg_cs, cs, sizeof *cs) != 0 && dofix(&idesc[0], "FREE BLK COUNT(S) WRONG IN SUPERBLK")) { memmove(cs, &newcg->cg_cs, sizeof *cs); sbdirty(); } if (rewritecg) { memmove(cg, newcg, (size_t)fs->fs_cgsize); dirty(cgbp); continue; } if (cursnapshot == 0 && memcmp(newcg, cg, basesize) != 0 && dofix(&idesc[2], "SUMMARY INFORMATION BAD")) { memmove(cg, newcg, (size_t)basesize); dirty(cgbp); } if (bkgrdflag != 0 || usedsoftdep || debug) update_maps(cg, newcg, bkgrdflag); if (cursnapshot == 0 && memcmp(cg_inosused(newcg), cg_inosused(cg), mapsize) != 0 && dofix(&idesc[1], "BLK(S) MISSING IN BIT MAPS")) { memmove(cg_inosused(cg), cg_inosused(newcg), (size_t)mapsize); dirty(cgbp); } } if (cursnapshot == 0 && memcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal) != 0 && dofix(&idesc[0], "SUMMARY BLK COUNT(S) WRONG IN SUPERBLK")) { memmove(&fs->fs_cstotal, &cstotal, sizeof cstotal); fs->fs_ronly = 0; fs->fs_fmod = 0; sbdirty(); } /* * When doing background fsck on a snapshot, figure out whether * the superblock summary is inaccurate and correct it when * necessary. 
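 * (Each summary field is corrected by its delta; for example the
 * directory count below is adjusted by
 *	cmd.value = cstotal.cs_ndir - fs->fs_cstotal.cs_ndir;
 * passed to the adjndir sysctl, and the free block, inode, fragment,
 * and cluster counts are handled the same way.)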
*/ if (cursnapshot != 0) { cmd.size = 1; cmd.value = cstotal.cs_ndir - fs->fs_cstotal.cs_ndir; if (cmd.value != 0) { if (debug) printf("adjndir by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjndir, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF DIRECTORIES", cmd.value); } cmd.value = cstotal.cs_nbfree - fs->fs_cstotal.cs_nbfree; if (cmd.value != 0) { if (debug) printf("adjnbfree by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjnbfree, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF FREE BLOCKS", cmd.value); } cmd.value = cstotal.cs_nifree - fs->fs_cstotal.cs_nifree; if (cmd.value != 0) { if (debug) printf("adjnifree by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjnifree, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF FREE INODES", cmd.value); } cmd.value = cstotal.cs_nffree - fs->fs_cstotal.cs_nffree; if (cmd.value != 0) { if (debug) printf("adjnffree by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjnffree, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF FREE FRAGS", cmd.value); } cmd.value = cstotal.cs_numclusters - fs->fs_cstotal.cs_numclusters; if (cmd.value != 0) { if (debug) printf("adjnumclusters by %+" PRIi64 "\n", cmd.value); if (bkgrdsumadj == 0 || sysctl(adjnumclusters, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST NUMBER OF FREE CLUSTERS", cmd.value); } } } /* * Compare the original cylinder group inode and block bitmaps with the * updated cylinder group inode and block bitmaps. Free inodes and blocks * that have been added. Complain if any previously freed inodes or blocks * are now allocated. */ void update_maps( struct cg *oldcg, /* cylinder group of claimed allocations */ struct cg *newcg, /* cylinder group of determined allocations */ int usesysctl) /* 1 => use sysctl interface to update maps */ { int inomapsize, excessdirs; struct fs *fs = &sblock; inomapsize = howmany(fs->fs_ipg, CHAR_BIT); excessdirs = oldcg->cg_cs.cs_ndir - newcg->cg_cs.cs_ndir; if (excessdirs < 0) { pfatal("LOST %d DIRECTORIES\n", -excessdirs); excessdirs = 0; } if (excessdirs > 0) check_maps(cg_inosused(newcg), cg_inosused(oldcg), inomapsize, oldcg->cg_cgx * (ufs2_daddr_t)fs->fs_ipg, "DIR", freedirs, 0, excessdirs, usesysctl); check_maps(cg_inosused(newcg), cg_inosused(oldcg), inomapsize, oldcg->cg_cgx * (ufs2_daddr_t)fs->fs_ipg, "FILE", freefiles, excessdirs, fs->fs_ipg, usesysctl); check_maps(cg_blksfree(oldcg), cg_blksfree(newcg), howmany(fs->fs_fpg, CHAR_BIT), oldcg->cg_cgx * (ufs2_daddr_t)fs->fs_fpg, "FRAG", freeblks, 0, fs->fs_fpg, usesysctl); } static void check_maps( u_char *map1, /* map of claimed allocations */ u_char *map2, /* map of determined allocations */ int mapsize, /* size of above two maps */ ufs2_daddr_t startvalue, /* resource value for first element in map */ const char *name, /* name of resource found in maps */ int *opcode, /* sysctl opcode to free resource */ int skip, /* number of entries to skip before starting to free */ int limit, /* limit on number of entries to free */ int usesysctl) /* 1 => use sysctl interface to update maps */ { # define BUFSIZE 16 char buf[BUFSIZE]; long i, j, k, l, m, size; ufs2_daddr_t n, astart, aend, ustart, uend; void (*msg)(const char *fmt, ...); if (usesysctl) msg = pfatal; else msg = pwarn; astart = ustart = aend = uend = -1; for (i = 0; i < mapsize; i++) { j = *map1++; k = *map2++; if (j == k) continue; for (m = 0, l = 1; m < CHAR_BIT; m++, l <<= 1) { if ((j & l) == (k & l)) continue; n =
startvalue + i * CHAR_BIT + m; if ((j & l) != 0) { if (astart == -1) { astart = aend = n; continue; } if (aend + 1 == n) { aend = n; continue; } if (astart == aend) (*msg)("ALLOCATED %s %" PRId64 " MARKED FREE\n", name, astart); else (*msg)("%s %sS %" PRId64 "-%" PRId64 " MARKED FREE\n", "ALLOCATED", name, astart, aend); astart = aend = n; } else { if (ustart == -1) { ustart = uend = n; continue; } if (uend + 1 == n) { uend = n; continue; } size = uend - ustart + 1; if (size <= skip) { skip -= size; ustart = uend = n; continue; } if (skip > 0) { ustart += skip; size -= skip; skip = 0; } if (size > limit) size = limit; if (debug && size == 1) pwarn("%s %s %" PRId64 " MARKED USED\n", "UNALLOCATED", name, ustart); else if (debug) pwarn("%s %sS %" PRId64 "-%" PRId64 " MARKED USED\n", "UNALLOCATED", name, ustart, ustart + size - 1); if (usesysctl != 0) { cmd.value = ustart; cmd.size = size; if (sysctl(opcode, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) { snprintf(buf, BUFSIZE, "FREE %s", name); rwerror(buf, cmd.value); } } limit -= size; if (limit <= 0) return; ustart = uend = n; } } } if (astart != -1) { if (astart == aend) (*msg)("ALLOCATED %s %" PRId64 " MARKED FREE\n", name, astart); else (*msg)("ALLOCATED %sS %" PRId64 "-%" PRId64 " MARKED FREE\n", name, astart, aend); } if (ustart != -1) { size = uend - ustart + 1; if (size <= skip) return; if (skip > 0) { ustart += skip; size -= skip; } if (size > limit) size = limit; if (debug) { if (size == 1) pwarn("UNALLOCATED %s %" PRId64 " MARKED USED\n", name, ustart); else pwarn("UNALLOCATED %sS %" PRId64 "-%" PRId64 " MARKED USED\n", name, ustart, ustart + size - 1); } if (usesysctl != 0) { cmd.value = ustart; cmd.size = size; if (sysctl(opcode, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) { snprintf(buf, BUFSIZE, "FREE %s", name); rwerror(buf, cmd.value); } } } } static void clear_blocks(ufs2_daddr_t start, ufs2_daddr_t end) { if (debug) printf("Zero frags %jd to %jd\n", start, end); if (Zflag) blzero(fswritefd, fsbtodb(&sblock, start), lfragtosize(&sblock, end - start + 1)); if (Eflag) blerase(fswritefd, fsbtodb(&sblock, start), lfragtosize(&sblock, end - start + 1)); } Index: head/sbin/fsck_ffs/suj.c =================================================================== --- head/sbin/fsck_ffs/suj.c (revision 313779) +++ head/sbin/fsck_ffs/suj.c (revision 313780) @@ -1,2801 +1,2802 @@ /*- * Copyright 2009, 2010 Jeffrey W. Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsck.h" #define DOTDOT_OFFSET DIRECTSIZ(1) #define SUJ_HASHSIZE 2048 #define SUJ_HASHMASK (SUJ_HASHSIZE - 1) #define SUJ_HASH(x) ((x * 2654435761) & SUJ_HASHMASK) struct suj_seg { TAILQ_ENTRY(suj_seg) ss_next; struct jsegrec ss_rec; uint8_t *ss_blk; }; struct suj_rec { TAILQ_ENTRY(suj_rec) sr_next; union jrec *sr_rec; }; TAILQ_HEAD(srechd, suj_rec); struct suj_ino { LIST_ENTRY(suj_ino) si_next; struct srechd si_recs; struct srechd si_newrecs; struct srechd si_movs; struct jtrncrec *si_trunc; ino_t si_ino; char si_skipparent; char si_hasrecs; char si_blkadj; char si_linkadj; int si_mode; nlink_t si_nlinkadj; nlink_t si_nlink; nlink_t si_dotlinks; }; LIST_HEAD(inohd, suj_ino); struct suj_blk { LIST_ENTRY(suj_blk) sb_next; struct srechd sb_recs; ufs2_daddr_t sb_blk; }; LIST_HEAD(blkhd, suj_blk); struct data_blk { LIST_ENTRY(data_blk) db_next; uint8_t *db_buf; ufs2_daddr_t db_blk; int db_size; int db_dirty; }; struct ino_blk { LIST_ENTRY(ino_blk) ib_next; uint8_t *ib_buf; int ib_dirty; ufs2_daddr_t ib_blk; }; LIST_HEAD(iblkhd, ino_blk); struct suj_cg { LIST_ENTRY(suj_cg) sc_next; struct blkhd sc_blkhash[SUJ_HASHSIZE]; struct inohd sc_inohash[SUJ_HASHSIZE]; struct iblkhd sc_iblkhash[SUJ_HASHSIZE]; struct ino_blk *sc_lastiblk; struct suj_ino *sc_lastino; struct suj_blk *sc_lastblk; uint8_t *sc_cgbuf; struct cg *sc_cgp; int sc_dirty; int sc_cgx; }; static LIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE]; static LIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE]; static struct suj_cg *lastcg; static struct data_blk *lastblk; static TAILQ_HEAD(seghd, suj_seg) allsegs; static uint64_t oldseq; static struct uufsd *disk = NULL; static struct fs *fs = NULL; static ino_t sujino; /* * Summary statistics. */ static uint64_t freefrags; static uint64_t freeblocks; static uint64_t freeinos; static uint64_t freedir; static uint64_t jbytes; static uint64_t jrecs; static jmp_buf jmpbuf; typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int); static void err_suj(const char *, ...) __dead2; static void ino_trunc(ino_t, off_t); static void ino_decr(ino_t); static void ino_adjust(struct suj_ino *); static void ino_build(struct suj_ino *); static int blk_isfree(ufs2_daddr_t); static void initsuj(void); static void * errmalloc(size_t n) { void *a; a = Malloc(n); if (a == NULL) err(EX_OSERR, "malloc(%zu)", n); return (a); } /* * When hit a fatal error in journalling check, print out * the error and then offer to fallback to normal fsck. */ static void err_suj(const char * restrict fmt, ...) { va_list ap; if (preen) (void)fprintf(stdout, "%s: ", cdevname); va_start(ap, fmt); (void)vfprintf(stdout, fmt, ap); va_end(ap); longjmp(jmpbuf, -1); } /* * Open the given provider, load superblock. 
*/ static void opendisk(const char *devnam) { if (disk != NULL) return; disk = Malloc(sizeof(*disk)); if (disk == NULL) err(EX_OSERR, "malloc(%zu)", sizeof(*disk)); if (ufs_disk_fillout(disk, devnam) == -1) { err(EX_OSERR, "ufs_disk_fillout(%s) failed: %s", devnam, disk->d_error); } fs = &disk->d_fs; if (real_dev_bsize == 0 && ioctl(disk->d_fd, DIOCGSECTORSIZE, &real_dev_bsize) == -1) real_dev_bsize = secsize; if (debug) printf("dev_bsize %u\n", real_dev_bsize); } /* * Mark file system as clean, write the super-block back, close the disk. */ static void closedisk(const char *devnam) { struct csum *cgsum; uint32_t i; /* * Recompute the fs summary info from correct cs summaries. */ bzero(&fs->fs_cstotal, sizeof(struct csum_total)); for (i = 0; i < fs->fs_ncg; i++) { cgsum = &fs->fs_cs(fs, i); fs->fs_cstotal.cs_nffree += cgsum->cs_nffree; fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree; fs->fs_cstotal.cs_nifree += cgsum->cs_nifree; fs->fs_cstotal.cs_ndir += cgsum->cs_ndir; } fs->fs_pendinginodes = 0; fs->fs_pendingblocks = 0; fs->fs_clean = 1; fs->fs_time = time(NULL); fs->fs_mtime = time(NULL); if (sbwrite(disk, 0) == -1) err(EX_OSERR, "sbwrite(%s)", devnam); if (ufs_disk_close(disk) == -1) err(EX_OSERR, "ufs_disk_close(%s)", devnam); free(disk); disk = NULL; fs = NULL; } /* * Lookup a cg by number in the hash so we can keep track of which cgs * need stats rebuilt. */ static struct suj_cg * cg_lookup(int cgx) { struct cghd *hd; struct suj_cg *sc; if (cgx < 0 || cgx >= fs->fs_ncg) err_suj("Bad cg number %d\n", cgx); if (lastcg && lastcg->sc_cgx == cgx) return (lastcg); hd = &cghash[SUJ_HASH(cgx)]; LIST_FOREACH(sc, hd, sc_next) if (sc->sc_cgx == cgx) { lastcg = sc; return (sc); } sc = errmalloc(sizeof(*sc)); bzero(sc, sizeof(*sc)); sc->sc_cgbuf = errmalloc(fs->fs_bsize); sc->sc_cgp = (struct cg *)sc->sc_cgbuf; sc->sc_cgx = cgx; LIST_INSERT_HEAD(hd, sc, sc_next); if (bread(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, fs->fs_bsize) == -1) err_suj("Unable to read cylinder group %d\n", sc->sc_cgx); return (sc); } /* * Lookup an inode number in the hash and allocate a suj_ino if it does * not exist. */ static struct suj_ino * ino_lookup(ino_t ino, int creat) { struct suj_ino *sino; struct inohd *hd; struct suj_cg *sc; sc = cg_lookup(ino_to_cg(fs, ino)); if (sc->sc_lastino && sc->sc_lastino->si_ino == ino) return (sc->sc_lastino); hd = &sc->sc_inohash[SUJ_HASH(ino)]; LIST_FOREACH(sino, hd, si_next) if (sino->si_ino == ino) return (sino); if (creat == 0) return (NULL); sino = errmalloc(sizeof(*sino)); bzero(sino, sizeof(*sino)); sino->si_ino = ino; TAILQ_INIT(&sino->si_recs); TAILQ_INIT(&sino->si_newrecs); TAILQ_INIT(&sino->si_movs); LIST_INSERT_HEAD(hd, sino, si_next); return (sino); } /* * Lookup a block number in the hash and allocate a suj_blk if it does * not exist. 
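 * (Blocks hash on SUJ_HASH(fragstoblks(fs, blk)), so every fragment
 * address within one filesystem block lands in the same bucket; the
 * exact fragment address is still compared against sb_blk.)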
*/ static struct suj_blk * blk_lookup(ufs2_daddr_t blk, int creat) { struct suj_blk *sblk; struct suj_cg *sc; struct blkhd *hd; sc = cg_lookup(dtog(fs, blk)); if (sc->sc_lastblk && sc->sc_lastblk->sb_blk == blk) return (sc->sc_lastblk); hd = &sc->sc_blkhash[SUJ_HASH(fragstoblks(fs, blk))]; LIST_FOREACH(sblk, hd, sb_next) if (sblk->sb_blk == blk) return (sblk); if (creat == 0) return (NULL); sblk = errmalloc(sizeof(*sblk)); bzero(sblk, sizeof(*sblk)); sblk->sb_blk = blk; TAILQ_INIT(&sblk->sb_recs); LIST_INSERT_HEAD(hd, sblk, sb_next); return (sblk); } static struct data_blk * dblk_lookup(ufs2_daddr_t blk) { struct data_blk *dblk; struct dblkhd *hd; hd = &dbhash[SUJ_HASH(fragstoblks(fs, blk))]; if (lastblk && lastblk->db_blk == blk) return (lastblk); LIST_FOREACH(dblk, hd, db_next) if (dblk->db_blk == blk) return (dblk); /* * The inode block wasn't located, allocate a new one. */ dblk = errmalloc(sizeof(*dblk)); bzero(dblk, sizeof(*dblk)); LIST_INSERT_HEAD(hd, dblk, db_next); dblk->db_blk = blk; return (dblk); } static uint8_t * dblk_read(ufs2_daddr_t blk, int size) { struct data_blk *dblk; dblk = dblk_lookup(blk); /* * I doubt size mismatches can happen in practice but it is trivial * to handle. */ if (size != dblk->db_size) { if (dblk->db_buf) free(dblk->db_buf); dblk->db_buf = errmalloc(size); dblk->db_size = size; if (bread(disk, fsbtodb(fs, blk), dblk->db_buf, size) == -1) err_suj("Failed to read data block %jd\n", blk); } return (dblk->db_buf); } static void dblk_dirty(ufs2_daddr_t blk) { struct data_blk *dblk; dblk = dblk_lookup(blk); dblk->db_dirty = 1; } static void dblk_write(void) { struct data_blk *dblk; int i; for (i = 0; i < SUJ_HASHSIZE; i++) { LIST_FOREACH(dblk, &dbhash[i], db_next) { if (dblk->db_dirty == 0 || dblk->db_size == 0) continue; if (bwrite(disk, fsbtodb(fs, dblk->db_blk), dblk->db_buf, dblk->db_size) == -1) err_suj("Unable to write block %jd\n", dblk->db_blk); } } } static union dinode * ino_read(ino_t ino) { struct ino_blk *iblk; struct iblkhd *hd; struct suj_cg *sc; ufs2_daddr_t blk; int off; blk = ino_to_fsba(fs, ino); sc = cg_lookup(ino_to_cg(fs, ino)); iblk = sc->sc_lastiblk; if (iblk && iblk->ib_blk == blk) goto found; hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))]; LIST_FOREACH(iblk, hd, ib_next) if (iblk->ib_blk == blk) goto found; /* * The inode block wasn't located, allocate a new one. 
*/ iblk = errmalloc(sizeof(*iblk)); bzero(iblk, sizeof(*iblk)); iblk->ib_buf = errmalloc(fs->fs_bsize); iblk->ib_blk = blk; LIST_INSERT_HEAD(hd, iblk, ib_next); if (bread(disk, fsbtodb(fs, blk), iblk->ib_buf, fs->fs_bsize) == -1) err_suj("Failed to read inode block %jd\n", blk); found: sc->sc_lastiblk = iblk; off = ino_to_fsbo(fs, ino); if (fs->fs_magic == FS_UFS1_MAGIC) return (union dinode *)&((struct ufs1_dinode *)iblk->ib_buf)[off]; else return (union dinode *)&((struct ufs2_dinode *)iblk->ib_buf)[off]; } static void ino_dirty(ino_t ino) { struct ino_blk *iblk; struct iblkhd *hd; struct suj_cg *sc; ufs2_daddr_t blk; blk = ino_to_fsba(fs, ino); sc = cg_lookup(ino_to_cg(fs, ino)); iblk = sc->sc_lastiblk; if (iblk && iblk->ib_blk == blk) { iblk->ib_dirty = 1; return; } hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))]; LIST_FOREACH(iblk, hd, ib_next) { if (iblk->ib_blk == blk) { iblk->ib_dirty = 1; return; } } ino_read(ino); ino_dirty(ino); } static void iblk_write(struct ino_blk *iblk) { if (iblk->ib_dirty == 0) return; if (bwrite(disk, fsbtodb(fs, iblk->ib_blk), iblk->ib_buf, fs->fs_bsize) == -1) err_suj("Failed to write inode block %jd\n", iblk->ib_blk); } static int blk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags) { ufs2_daddr_t bstart; ufs2_daddr_t bend; ufs2_daddr_t end; end = start + frags; bstart = brec->jb_blkno + brec->jb_oldfrags; bend = bstart + brec->jb_frags; if (start < bend && end > bstart) return (1); return (0); } static int blk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start, int frags) { if (brec->jb_ino != ino || brec->jb_lbn != lbn) return (0); if (brec->jb_blkno + brec->jb_oldfrags != start) return (0); if (brec->jb_frags < frags) return (0); return (1); } static void blk_setmask(struct jblkrec *brec, int *mask) { int i; for (i = brec->jb_oldfrags; i < brec->jb_oldfrags + brec->jb_frags; i++) *mask |= 1 << i; } /* * Determine whether a given block has been reallocated to a new location. * Returns a mask of overlapping bits if any frags have been reused or * zero if the block has not been re-used and the contents can be trusted. * * This is used to ensure that an orphaned pointer due to truncate is safe * to be freed. The mask value can be used to free partial blocks. */ static int blk_freemask(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags) { struct suj_blk *sblk; struct suj_rec *srec; struct jblkrec *brec; int mask; int off; /* * To be certain we're not freeing a reallocated block we look up * this block in the blk hash and see if there is an allocation * journal record that overlaps with any fragments in the block * we're concerned with. If any fragments have been reallocated, * the block has already been freed and re-used for another purpose. */ mask = 0; sblk = blk_lookup(blknum(fs, blk), 0); if (sblk == NULL) return (0); off = blk - sblk->sb_blk; TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { brec = (struct jblkrec *)srec->sr_rec; /* * If the block overlaps but does not match * exactly, this record refers to the current * location. */ if (blk_overlaps(brec, blk, frags) == 0) continue; if (blk_equals(brec, ino, lbn, blk, frags) == 1) mask = 0; else blk_setmask(brec, &mask); } if (debug) printf("blk_freemask: blk %jd sblk %jd off %d mask 0x%X\n", blk, sblk->sb_blk, off, mask); return (mask >> off); } /* * Determine whether it is safe to follow an indirect. It is not safe * if any part of the indirect has been reallocated or the last journal * entry was an allocation.
Just allocated indirects may not have valid * pointers yet and all of their children will have their own records. * It is also not safe to follow an indirect if the cg bitmap has been * cleared as a new allocation may write to the block prior to the journal * being written. * * Returns 1 if it's safe to follow the indirect and 0 otherwise. */ static int blk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn) { struct suj_blk *sblk; struct jblkrec *brec; sblk = blk_lookup(blk, 0); if (sblk == NULL) return (1); if (TAILQ_EMPTY(&sblk->sb_recs)) return (1); brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec; if (blk_equals(brec, ino, lbn, blk, fs->fs_frag)) if (brec->jb_op == JOP_FREEBLK) return (!blk_isfree(blk)); return (0); } /* * Clear an inode from the cg bitmap. If the inode was already clear return * 0 so the caller knows it does not have to check the inode contents. */ static int ino_free(ino_t ino, int mode) { struct suj_cg *sc; uint8_t *inosused; struct cg *cgp; int cg; cg = ino_to_cg(fs, ino); ino = ino % fs->fs_ipg; sc = cg_lookup(cg); cgp = sc->sc_cgp; inosused = cg_inosused(cgp); /* * The bitmap may never have made it to the disk so we have to * conditionally clear. We can avoid writing the cg in this case. */ if (isclr(inosused, ino)) return (0); freeinos++; clrbit(inosused, ino); if (ino < cgp->cg_irotor) cgp->cg_irotor = ino; cgp->cg_cs.cs_nifree++; if ((mode & IFMT) == IFDIR) { freedir++; cgp->cg_cs.cs_ndir--; } sc->sc_dirty = 1; return (1); } /* * Free 'frags' frags starting at filesystem block 'bno' skipping any frags * set in the mask. */ static void blk_free(ufs2_daddr_t bno, int mask, int frags) { ufs1_daddr_t fragno, cgbno; struct suj_cg *sc; struct cg *cgp; int i, cg; uint8_t *blksfree; if (debug) printf("Freeing %d frags at blk %jd mask 0x%x\n", frags, bno, mask); cg = dtog(fs, bno); sc = cg_lookup(cg); cgp = sc->sc_cgp; cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); /* * If it's not allocated we only wrote the journal entry * and never the bitmaps. Here we unconditionally clear and * resolve the cg summary later. */ if (frags == fs->fs_frag && mask == 0) { fragno = fragstoblks(fs, cgbno); ffs_setblock(fs, blksfree, fragno); freeblocks++; } else { /* * deallocate the fragment */ for (i = 0; i < frags; i++) if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) { freefrags++; setbit(blksfree, cgbno + i); } } sc->sc_dirty = 1; } /* * Returns 1 if the whole block starting at 'bno' is marked free and 0 * otherwise. */ static int blk_isfree(ufs2_daddr_t bno) { struct suj_cg *sc; sc = cg_lookup(dtog(fs, bno)); return ffs_isblock(fs, cg_blksfree(sc->sc_cgp), dtogd(fs, bno)); } /* * Fetch an indirect block to find the block at a given lbn. The lbn * may be negative to fetch a specific indirect block pointer or positive * to fetch a specific block. 
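 * (Indirect roots use biased negative lbns: with UFS_NDADDR == 12,
 * for example, the first indirect root is addressed as lbn -12,
 * which ino_blkatoff() below matches with its lbn == -cur - i test;
 * each deeper root is offset one further.)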
*/ static ufs2_daddr_t indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn) { ufs2_daddr_t *bap2; ufs2_daddr_t *bap1; ufs_lbn_t lbnadd; ufs_lbn_t base; int level; int i; if (blk == 0) return (0); level = lbn_level(cur); if (level == -1) err_suj("Invalid indir lbn %jd\n", lbn); if (level == 0 && lbn < 0) err_suj("Invalid lbn %jd\n", lbn); bap2 = (void *)dblk_read(blk, fs->fs_bsize); bap1 = (void *)bap2; lbnadd = 1; base = -(cur + level); for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); if (lbn > 0) i = (lbn - base) / lbnadd; else i = (-lbn - base) / lbnadd; if (i < 0 || i >= NINDIR(fs)) err_suj("Invalid indirect index %d produced by lbn %jd\n", i, lbn); if (level == 0) cur = base + (i * lbnadd); else cur = -(base + (i * lbnadd)) - (level - 1); if (fs->fs_magic == FS_UFS1_MAGIC) blk = bap1[i]; else blk = bap2[i]; if (cur == lbn) return (blk); if (level == 0) err_suj("Invalid lbn %jd at level 0\n", lbn); return indir_blkatoff(blk, ino, cur, lbn); } /* * Finds the disk block address at the specified lbn within the inode * specified by ip. This follows the whole tree and honors di_size and * di_extsize so it is a true test of reachability. The lbn may be * negative if an extattr or indirect block is requested. */ static ufs2_daddr_t ino_blkatoff(union dinode *ip, ino_t ino, ufs_lbn_t lbn, int *frags) { ufs_lbn_t tmpval; ufs_lbn_t cur; ufs_lbn_t next; int i; /* * Handle extattr blocks first. */ - if (lbn < 0 && lbn >= -NXADDR) { + if (lbn < 0 && lbn >= -UFS_NXADDR) { lbn = -1 - lbn; if (lbn > lblkno(fs, ip->dp2.di_extsize - 1)) return (0); *frags = numfrags(fs, sblksize(fs, ip->dp2.di_extsize, lbn)); return (ip->dp2.di_extb[lbn]); } /* * Now direct and indirect. */ if (DIP(ip, di_mode) == IFLNK && DIP(ip, di_size) < fs->fs_maxsymlinklen) return (0); - if (lbn >= 0 && lbn < NDADDR) { + if (lbn >= 0 && lbn < UFS_NDADDR) { *frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn)); return (DIP(ip, di_db[lbn])); } *frags = fs->fs_frag; - for (i = 0, tmpval = NINDIR(fs), cur = NDADDR; i < NIADDR; i++, + for (i = 0, tmpval = NINDIR(fs), cur = UFS_NDADDR; i < UFS_NIADDR; i++, tmpval *= NINDIR(fs), cur = next) { next = cur + tmpval; if (lbn == -cur - i) return (DIP(ip, di_ib[i])); /* * Determine whether the lbn in question is within this tree. */ if (lbn < 0 && -lbn >= next) continue; if (lbn > 0 && lbn >= next) continue; return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn); } err_suj("lbn %jd not in ino\n", lbn); /* NOTREACHED */ } /* * Determine whether a block exists at a particular lbn in an inode. * Returns 1 if found, 0 if not. lbn may be negative for indirects * or ext blocks. */ static int blk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags) { union dinode *ip; ufs2_daddr_t nblk; ip = ino_read(ino); if (DIP(ip, di_nlink) == 0 || DIP(ip, di_mode) == 0) return (0); nblk = ino_blkatoff(ip, ino, lbn, frags); return (nblk == blk); } /* * Clear the directory entry at diroff that should point to child. Minimal * checking is done and it is assumed that this path was verified with isat. 
*/ static void ino_clrat(ino_t parent, off_t diroff, ino_t child) { union dinode *dip; struct direct *dp; ufs2_daddr_t blk; uint8_t *block; ufs_lbn_t lbn; int blksize; int frags; int doff; if (debug) printf("Clearing inode %ju from parent %ju at offset %jd\n", (uintmax_t)child, (uintmax_t)parent, diroff); lbn = lblkno(fs, diroff); doff = blkoff(fs, diroff); dip = ino_read(parent); blk = ino_blkatoff(dip, parent, lbn, &frags); blksize = sblksize(fs, DIP(dip, di_size), lbn); block = dblk_read(blk, blksize); dp = (struct direct *)&block[doff]; if (dp->d_ino != child) errx(1, "Inode %ju does not exist in %ju at %jd", (uintmax_t)child, (uintmax_t)parent, diroff); dp->d_ino = 0; dblk_dirty(blk); /* * The actual .. reference count will already have been removed * from the parent by the .. remref record. */ } /* * Determines whether a pointer to an inode exists within a directory * at a specified offset. Returns the mode of the found entry. */ static int ino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot) { union dinode *dip; struct direct *dp; ufs2_daddr_t blk; uint8_t *block; ufs_lbn_t lbn; int blksize; int frags; int dpoff; int doff; *isdot = 0; dip = ino_read(parent); *mode = DIP(dip, di_mode); if ((*mode & IFMT) != IFDIR) { if (debug) { /* * This can happen if the parent inode * was reallocated. */ if (*mode != 0) printf("Directory %ju has bad mode %o\n", (uintmax_t)parent, *mode); else printf("Directory %ju has zero mode\n", (uintmax_t)parent); } return (0); } lbn = lblkno(fs, diroff); doff = blkoff(fs, diroff); blksize = sblksize(fs, DIP(dip, di_size), lbn); if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) { if (debug) printf("ino %ju absent from %ju due to offset %jd" " exceeding size %jd\n", (uintmax_t)child, (uintmax_t)parent, diroff, DIP(dip, di_size)); return (0); } blk = ino_blkatoff(dip, parent, lbn, &frags); if (blk <= 0) { if (debug) printf("Sparse directory %ju", (uintmax_t)parent); return (0); } block = dblk_read(blk, blksize); /* * Walk through the records from the start of the block to be * certain we hit a valid record and not some junk in the middle * of a file name. Stop when we reach or pass the expected offset. */ dpoff = rounddown(doff, DIRBLKSIZ); do { dp = (struct direct *)&block[dpoff]; if (dpoff == doff) break; if (dp->d_reclen == 0) break; dpoff += dp->d_reclen; } while (dpoff <= doff); if (dpoff > fs->fs_bsize) err_suj("Corrupt directory block in dir ino %ju\n", (uintmax_t)parent); /* Not found. */ if (dpoff != doff) { if (debug) printf("ino %ju not found in %ju, lbn %jd, dpoff %d\n", (uintmax_t)child, (uintmax_t)parent, lbn, dpoff); return (0); } /* * We found the item in question. Record the mode and whether it's * a . or .. link for the caller. */ if (dp->d_ino == child) { if (child == parent) *isdot = 1; else if (dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.') *isdot = 1; *mode = DTTOIF(dp->d_type); return (1); } if (debug) printf("ino %ju doesn't match dirent ino %ju in parent %ju\n", (uintmax_t)child, (uintmax_t)dp->d_ino, (uintmax_t)parent); return (0); } #define VISIT_INDIR 0x0001 #define VISIT_EXT 0x0002 #define VISIT_ROOT 0x0004 /* Operation came via root & valid pointers. */ /* * Read an indirect level which may or may not be linked into an inode. 
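 * (Unless VISIT_ROOT marks the pointer as coming from a trusted
 * path, blk_isindir() is consulted first and an untrustworthy
 * indirect is skipped rather than followed.)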
*/ static void indir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags, ino_visitor visitor, int flags) { ufs2_daddr_t *bap2; ufs1_daddr_t *bap1; ufs_lbn_t lbnadd; ufs2_daddr_t nblk; ufs_lbn_t nlbn; int level; int i; /* * Don't visit indirect blocks with contents we can't trust. This * should only happen when indir_visit() is called to complete a * truncate that never finished and not when a pointer is found via * an inode. */ if (blk == 0) return; level = lbn_level(lbn); if (level == -1) err_suj("Invalid level for lbn %jd\n", lbn); if ((flags & VISIT_ROOT) == 0 && blk_isindir(blk, ino, lbn) == 0) { if (debug) printf("blk %jd ino %ju lbn %jd(%d) is not indir.\n", blk, (uintmax_t)ino, lbn, level); goto out; } lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); bap1 = (void *)dblk_read(blk, fs->fs_bsize); bap2 = (void *)bap1; for (i = 0; i < NINDIR(fs); i++) { if (fs->fs_magic == FS_UFS1_MAGIC) nblk = *bap1++; else nblk = *bap2++; if (nblk == 0) continue; if (level == 0) { nlbn = -lbn + i * lbnadd; (*frags) += fs->fs_frag; visitor(ino, nlbn, nblk, fs->fs_frag); } else { nlbn = (lbn + 1) - (i * lbnadd); indir_visit(ino, nlbn, nblk, frags, visitor, flags); } } out: if (flags & VISIT_INDIR) { (*frags) += fs->fs_frag; visitor(ino, lbn, blk, fs->fs_frag); } } /* * Visit each block in an inode as specified by 'flags' and call a * callback function. The callback may inspect or free blocks. The * count of frags found according to the size in the file is returned. * This is not valid for sparse files but may be used to determine * the correct di_blocks for a file. */ static uint64_t ino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags) { ufs_lbn_t nextlbn; ufs_lbn_t tmpval; ufs_lbn_t lbn; uint64_t size; uint64_t fragcnt; int mode; int frags; int i; size = DIP(ip, di_size); mode = DIP(ip, di_mode) & IFMT; fragcnt = 0; if ((flags & VISIT_EXT) && fs->fs_magic == FS_UFS2_MAGIC && ip->dp2.di_extsize) { - for (i = 0; i < NXADDR; i++) { + for (i = 0; i < UFS_NXADDR; i++) { if (ip->dp2.di_extb[i] == 0) continue; frags = sblksize(fs, ip->dp2.di_extsize, i); frags = numfrags(fs, frags); fragcnt += frags; visitor(ino, -1 - i, ip->dp2.di_extb[i], frags); } } /* Skip datablocks for short links and devices. */ if (mode == IFBLK || mode == IFCHR || (mode == IFLNK && size < fs->fs_maxsymlinklen)) return (fragcnt); - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < UFS_NDADDR; i++) { if (DIP(ip, di_db[i]) == 0) continue; frags = sblksize(fs, size, i); frags = numfrags(fs, frags); fragcnt += frags; visitor(ino, i, DIP(ip, di_db[i]), frags); } /* * We know the following indirects are real as we're following * real pointers to them. */ flags |= VISIT_ROOT; - for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++, + for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++, lbn = nextlbn) { nextlbn = lbn + tmpval; tmpval *= NINDIR(fs); if (DIP(ip, di_ib[i]) == 0) continue; indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor, flags); } return (fragcnt); } /* * Null visitor function used when we just want to count blocks and * record the lbn. */ ufs_lbn_t visitlbn; static void null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) { if (lbn > 0) visitlbn = lbn; } /* * Recalculate di_blocks when we discover that a block allocation or * free was not successfully completed. The kernel does not roll this back * because it would be too expensive to compute which indirects were * reachable at the time the inode was written. 
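 * (The recount walks every reachable data, extattr, and indirect
 * fragment via ino_visit() and converts fragments to DEV_BSIZE
 * sectors with fsbtodb(); with 4096-byte fragments, for example,
 * 10 reachable fragments become fsbtodb(fs, 10) == 80 sectors.)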
*/ static void ino_adjblks(struct suj_ino *sino) { union dinode *ip; uint64_t blocks; uint64_t frags; off_t isize; off_t size; ino_t ino; ino = sino->si_ino; ip = ino_read(ino); /* No need to adjust zero'd inodes. */ if (DIP(ip, di_mode) == 0) return; /* * Visit all blocks and count them as well as recording the last * valid lbn in the file. If the file size doesn't agree with the * last lbn we need to truncate to fix it. Otherwise just adjust * the blocks count. */ visitlbn = 0; frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT); blocks = fsbtodb(fs, frags); /* * We assume the size and direct block list is kept coherent by * softdep. For files that have extended into indirects we truncate * to the size in the inode or the maximum size permitted by * populated indirects. */ - if (visitlbn >= NDADDR) { + if (visitlbn >= UFS_NDADDR) { isize = DIP(ip, di_size); size = lblktosize(fs, visitlbn + 1); if (isize > size) isize = size; /* Always truncate to free any unpopulated indirects. */ ino_trunc(sino->si_ino, isize); return; } if (blocks == DIP(ip, di_blocks)) return; if (debug) printf("ino %ju adjusting block count from %jd to %jd\n", (uintmax_t)ino, DIP(ip, di_blocks), blocks); DIP_SET(ip, di_blocks, blocks); ino_dirty(ino); } static void blk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) { blk_free(blk, blk_freemask(blk, ino, lbn, frags), frags); } /* * Free a block or tree of blocks that was previously rooted in ino at * the given lbn. If the lbn is an indirect all children are freed * recursively. */ static void blk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow) { uint64_t resid; int mask; mask = blk_freemask(blk, ino, lbn, frags); resid = 0; - if (lbn <= -NDADDR && follow && mask == 0) + if (lbn <= -UFS_NDADDR && follow && mask == 0) indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR); else blk_free(blk, mask, frags); } static void ino_setskip(struct suj_ino *sino, ino_t parent) { int isdot; int mode; if (ino_isat(sino->si_ino, DOTDOT_OFFSET, parent, &mode, &isdot)) sino->si_skipparent = 1; } static void ino_remref(ino_t parent, ino_t child, uint64_t diroff, int isdotdot) { struct suj_ino *sino; struct suj_rec *srec; struct jrefrec *rrec; /* * Lookup this inode to see if we have a record for it. */ sino = ino_lookup(child, 0); /* * Tell any child directories we've already removed their * parent link cnt. Don't try to adjust our link down again. */ if (sino != NULL && isdotdot == 0) ino_setskip(sino, parent); /* * No valid record for this inode. Just drop the on-disk * link by one. */ if (sino == NULL || sino->si_hasrecs == 0) { ino_decr(child); return; } /* * Use ino_adjust() if ino_check() has already processed this * child. If we lose the last non-dot reference to a * directory it will be discarded. */ if (sino->si_linkadj) { sino->si_nlink--; if (isdotdot) sino->si_dotlinks--; ino_adjust(sino); return; } /* * If we haven't yet processed this inode we need to make * sure we will successfully discover the lost path. If not * use nlinkadj to remember. */ TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { rrec = (struct jrefrec *)srec->sr_rec; if (rrec->jr_parent == parent && rrec->jr_diroff == diroff) return; } sino->si_nlinkadj++; } /* * Free the children of a directory when the directory is discarded. 
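 * (Empty and whiteout (UFS_WINO) entries are skipped, "." is always
 * skipped, ".." is skipped when si_skipparent is set, and every
 * remaining entry has its reference dropped through ino_remref().)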
*/ static void ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) { struct suj_ino *sino; struct direct *dp; off_t diroff; uint8_t *block; int skipparent; int isdotdot; int dpoff; int size; sino = ino_lookup(ino, 0); if (sino) skipparent = sino->si_skipparent; else skipparent = 0; size = lfragtosize(fs, frags); block = dblk_read(blk, size); dp = (struct direct *)&block[0]; for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) { dp = (struct direct *)&block[dpoff]; - if (dp->d_ino == 0 || dp->d_ino == WINO) + if (dp->d_ino == 0 || dp->d_ino == UFS_WINO) continue; if (dp->d_namlen == 1 && dp->d_name[0] == '.') continue; isdotdot = dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'; if (isdotdot && skipparent == 1) continue; if (debug) printf("Directory %ju removing ino %ju name %s\n", (uintmax_t)ino, (uintmax_t)dp->d_ino, dp->d_name); diroff = lblktosize(fs, lbn) + dpoff; ino_remref(ino, dp->d_ino, diroff, isdotdot); } } /* * Reclaim an inode, freeing all blocks and decrementing all children's * link counts. Free the inode back to the cg. */ static void ino_reclaim(union dinode *ip, ino_t ino, int mode) { uint32_t gen; - if (ino == ROOTINO) - err_suj("Attempting to free ROOTINO\n"); + if (ino == UFS_ROOTINO) + err_suj("Attempting to free UFS_ROOTINO\n"); if (debug) printf("Truncating and freeing ino %ju, nlink %d, mode %o\n", (uintmax_t)ino, DIP(ip, di_nlink), DIP(ip, di_mode)); /* We are freeing an inode or directory. */ if ((DIP(ip, di_mode) & IFMT) == IFDIR) ino_visit(ip, ino, ino_free_children, 0); DIP_SET(ip, di_nlink, 0); ino_visit(ip, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR); /* Here we have to clear the inode and release any blocks it holds. */ gen = DIP(ip, di_gen); if (fs->fs_magic == FS_UFS1_MAGIC) bzero(ip, sizeof(struct ufs1_dinode)); else bzero(ip, sizeof(struct ufs2_dinode)); DIP_SET(ip, di_gen, gen); ino_dirty(ino); ino_free(ino, mode); return; } /* * Adjust an inode's link count down by one when a directory goes away. */ static void ino_decr(ino_t ino) { union dinode *ip; int reqlink; int nlink; int mode; ip = ino_read(ino); nlink = DIP(ip, di_nlink); mode = DIP(ip, di_mode); if (nlink < 1) err_suj("Inode %d link count %d invalid\n", ino, nlink); if (mode == 0) err_suj("Inode %d has a link of %d with 0 mode\n", ino, nlink); nlink--; if ((mode & IFMT) == IFDIR) reqlink = 2; else reqlink = 1; if (nlink < reqlink) { if (debug) printf("ino %ju not enough links to live %d < %d\n", (uintmax_t)ino, nlink, reqlink); ino_reclaim(ip, ino, mode); return; } DIP_SET(ip, di_nlink, nlink); ino_dirty(ino); } /* * Adjust the inode link count to 'nlink'. If the count reaches zero * free it. */ static void ino_adjust(struct suj_ino *sino) { struct jrefrec *rrec; struct suj_rec *srec; struct suj_ino *stmp; union dinode *ip; nlink_t nlink; int recmode; int reqlink; int isdot; int mode; ino_t ino; nlink = sino->si_nlink; ino = sino->si_ino; mode = sino->si_mode & IFMT; /* * If it's a directory with no dot links, it was truncated before * the name was cleared. We need to clear the dirent that * points at it. 
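 * (Each journal record is re-checked with ino_isat(); the entry that
 * is still present on disk is then zeroed with ino_clrat().)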
*/ if (mode == IFDIR && nlink == 1 && sino->si_dotlinks == 0) { sino->si_nlink = nlink = 0; TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { rrec = (struct jrefrec *)srec->sr_rec; if (ino_isat(rrec->jr_parent, rrec->jr_diroff, ino, &recmode, &isdot) == 0) continue; ino_clrat(rrec->jr_parent, rrec->jr_diroff, ino); break; } if (srec == NULL) errx(1, "Directory %ju name not found", (uintmax_t)ino); } /* * If it's a directory with no real names pointing to it go ahead * and truncate it. This will free any children. */ if (mode == IFDIR && nlink - sino->si_dotlinks == 0) { sino->si_nlink = nlink = 0; /* * Mark any .. links so they know not to free this inode * when they are removed. */ TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { rrec = (struct jrefrec *)srec->sr_rec; if (rrec->jr_diroff == DOTDOT_OFFSET) { stmp = ino_lookup(rrec->jr_parent, 0); if (stmp) ino_setskip(stmp, ino); } } } ip = ino_read(ino); mode = DIP(ip, di_mode) & IFMT; if (nlink > LINK_MAX) err_suj("ino %ju nlink manipulation error, new %ju, old %d\n", (uintmax_t)ino, (uintmax_t)nlink, DIP(ip, di_nlink)); if (debug) printf("Adjusting ino %ju, nlink %ju, old link %d lastmode %o\n", (uintmax_t)ino, (uintmax_t)nlink, DIP(ip, di_nlink), sino->si_mode); if (mode == 0) { if (debug) printf("ino %ju, zero inode freeing bitmap\n", (uintmax_t)ino); ino_free(ino, sino->si_mode); return; } /* XXX Should be an assert? */ if (mode != sino->si_mode && debug) printf("ino %ju, mode %o != %o\n", (uintmax_t)ino, mode, sino->si_mode); if ((mode & IFMT) == IFDIR) reqlink = 2; else reqlink = 1; /* If the inode doesn't have enough links to live, free it. */ if (nlink < reqlink) { if (debug) printf("ino %ju not enough links to live %ju < %ju\n", (uintmax_t)ino, (uintmax_t)nlink, (uintmax_t)reqlink); ino_reclaim(ip, ino, mode); return; } /* If required write the updated link count. */ if (DIP(ip, di_nlink) == nlink) { if (debug) printf("ino %ju, link matches, skipping.\n", (uintmax_t)ino); return; } DIP_SET(ip, di_nlink, nlink); ino_dirty(ino); } /* * Truncate some or all blocks in an indirect, freeing any that are required * and zeroing the indirect. */ static void indir_trunc(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, ufs_lbn_t lastlbn) { ufs2_daddr_t *bap2; ufs1_daddr_t *bap1; ufs_lbn_t lbnadd; ufs2_daddr_t nblk; ufs_lbn_t next; ufs_lbn_t nlbn; int dirty; int level; int i; if (blk == 0) return; dirty = 0; level = lbn_level(lbn); if (level == -1) err_suj("Invalid level for lbn %jd\n", lbn); lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); bap1 = (void *)dblk_read(blk, fs->fs_bsize); bap2 = (void *)bap1; for (i = 0; i < NINDIR(fs); i++) { if (fs->fs_magic == FS_UFS1_MAGIC) nblk = *bap1++; else nblk = *bap2++; if (nblk == 0) continue; if (level != 0) { nlbn = (lbn + 1) - (i * lbnadd); /* * Calculate the lbn of the next indirect to * determine if any of this indirect must be * reclaimed. */ next = -(lbn + level) + ((i+1) * lbnadd); if (next <= lastlbn) continue; indir_trunc(ino, nlbn, nblk, lastlbn); /* If all of this indirect was reclaimed, free it. */ nlbn = next - lbnadd; if (nlbn < lastlbn) continue; } else { nlbn = -lbn + i * lbnadd; if (nlbn < lastlbn) continue; } dirty = 1; blk_free(nblk, 0, fs->fs_frag); if (fs->fs_magic == FS_UFS1_MAGIC) *(bap1 - 1) = 0; else *(bap2 - 1) = 0; } if (dirty) dblk_dirty(blk); } /* * Truncate an inode to the minimum of the given size or the last populated * block after any over size have been discarded. The kernel would allocate * the last block in the file but fsck does not and neither do we. 
This * code never extends files, only shrinks them. */ static void ino_trunc(ino_t ino, off_t size) { union dinode *ip; ufs2_daddr_t bn; uint64_t totalfrags; ufs_lbn_t nextlbn; ufs_lbn_t lastlbn; ufs_lbn_t tmpval; ufs_lbn_t lbn; ufs_lbn_t i; int frags; off_t cursize; off_t off; int mode; ip = ino_read(ino); mode = DIP(ip, di_mode) & IFMT; cursize = DIP(ip, di_size); if (debug) printf("Truncating ino %ju, mode %o to size %jd from size %jd\n", (uintmax_t)ino, mode, size, cursize); /* Skip datablocks for short links and devices. */ if (mode == 0 || mode == IFBLK || mode == IFCHR || (mode == IFLNK && cursize < fs->fs_maxsymlinklen)) return; /* Don't extend. */ if (size > cursize) size = cursize; lastlbn = lblkno(fs, blkroundup(fs, size)); - for (i = lastlbn; i < NDADDR; i++) { + for (i = lastlbn; i < UFS_NDADDR; i++) { if (DIP(ip, di_db[i]) == 0) continue; frags = sblksize(fs, cursize, i); frags = numfrags(fs, frags); blk_free(DIP(ip, di_db[i]), 0, frags); DIP_SET(ip, di_db[i], 0); } /* * Follow indirect blocks, freeing anything required. */ - for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++, + for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++, lbn = nextlbn) { nextlbn = lbn + tmpval; tmpval *= NINDIR(fs); /* If we're not freeing any in this indirect range skip it. */ if (lastlbn >= nextlbn) continue; if (DIP(ip, di_ib[i]) == 0) continue; indir_trunc(ino, -lbn - i, DIP(ip, di_ib[i]), lastlbn); /* If we freed everything in this indirect free the indir. */ if (lastlbn > lbn) continue; blk_free(DIP(ip, di_ib[i]), 0, frags); DIP_SET(ip, di_ib[i], 0); } ino_dirty(ino); /* * Now that we've freed any whole blocks that exceed the desired * truncation size, figure out how many blocks remain and what the * last populated lbn is. We will set the size to this last lbn * rather than worrying about allocating the final lbn as the kernel * would've done. This is consistent with normal fsck behavior. */ visitlbn = 0; totalfrags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT); if (size > lblktosize(fs, visitlbn + 1)) size = lblktosize(fs, visitlbn + 1); /* * If we're truncating direct blocks we have to adjust frags * accordingly. */ - if (visitlbn < NDADDR && totalfrags) { + if (visitlbn < UFS_NDADDR && totalfrags) { long oldspace, newspace; bn = DIP(ip, di_db[visitlbn]); if (bn == 0) err_suj("Bad blk at ino %ju lbn %jd\n", (uintmax_t)ino, visitlbn); oldspace = sblksize(fs, cursize, visitlbn); newspace = sblksize(fs, size, visitlbn); if (oldspace != newspace) { bn += numfrags(fs, newspace); frags = numfrags(fs, oldspace - newspace); blk_free(bn, 0, frags); totalfrags -= frags; } } DIP_SET(ip, di_blocks, fsbtodb(fs, totalfrags)); DIP_SET(ip, di_size, size); /* * If we've truncated into the middle of a block or frag we have * to zero it here. Otherwise the file could extend into * uninitialized space later. */ off = blkoff(fs, size); if (off && DIP(ip, di_mode) != IFDIR) { uint8_t *buf; long clrsize; bn = ino_blkatoff(ip, ino, visitlbn, &frags); if (bn == 0) err_suj("Block missing from ino %ju at lbn %jd\n", (uintmax_t)ino, visitlbn); clrsize = frags * fs->fs_fsize; buf = dblk_read(bn, clrsize); clrsize -= off; buf += off; bzero(buf, clrsize); dblk_dirty(bn); } return; } /* * Process records available for one inode and determine whether the * link count is correct or needs adjusting. 
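 * As a worked example of the accounting described below: an inode
 * journaled with nlink 2 whose single remove record's name is still
 * present on disk resolves to 2 - 1 + 1 = 2 (no change), while a
 * completed remove resolves to 2 - 1 + 0 = 1.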
*/ static void ino_check(struct suj_ino *sino) { struct suj_rec *srec; struct jrefrec *rrec; nlink_t dotlinks; int newlinks; int removes; int nlink; ino_t ino; int isdot; int isat; int mode; if (sino->si_hasrecs == 0) return; ino = sino->si_ino; rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec; nlink = rrec->jr_nlink; newlinks = 0; dotlinks = 0; removes = sino->si_nlinkadj; TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { rrec = (struct jrefrec *)srec->sr_rec; isat = ino_isat(rrec->jr_parent, rrec->jr_diroff, rrec->jr_ino, &mode, &isdot); if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT)) err_suj("Inode mode/directory type mismatch %o != %o\n", mode, rrec->jr_mode); if (debug) printf("jrefrec: op %d ino %ju, nlink %ju, parent %ju, " "diroff %jd, mode %o, isat %d, isdot %d\n", rrec->jr_op, (uintmax_t)rrec->jr_ino, (uintmax_t)rrec->jr_nlink, (uintmax_t)rrec->jr_parent, (uintmax_t)rrec->jr_diroff, rrec->jr_mode, isat, isdot); mode = rrec->jr_mode & IFMT; if (rrec->jr_op == JOP_REMREF) removes++; newlinks += isat; if (isdot) dotlinks += isat; } /* * The number of links that remain are the starting link count * subtracted by the total number of removes with the total * links discovered back in. An incomplete remove thus * makes no change to the link count but an add increases * by one. */ if (debug) printf( "ino %ju nlink %ju newlinks %ju removes %ju dotlinks %ju\n", (uintmax_t)ino, (uintmax_t)nlink, (uintmax_t)newlinks, (uintmax_t)removes, (uintmax_t)dotlinks); nlink += newlinks; nlink -= removes; sino->si_linkadj = 1; sino->si_nlink = nlink; sino->si_dotlinks = dotlinks; sino->si_mode = mode; ino_adjust(sino); } /* * Process records available for one block and determine whether it is * still allocated and whether the owning inode needs to be updated or * a free completed. */ static void blk_check(struct suj_blk *sblk) { struct suj_rec *srec; struct jblkrec *brec; struct suj_ino *sino; ufs2_daddr_t blk; int mask; int frags; int isat; /* * Each suj_blk actually contains records for any fragments in that * block. As a result we must evaluate each record individually. */ sino = NULL; TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { brec = (struct jblkrec *)srec->sr_rec; frags = brec->jb_frags; blk = brec->jb_blkno + brec->jb_oldfrags; isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags); if (sino == NULL || sino->si_ino != brec->jb_ino) { sino = ino_lookup(brec->jb_ino, 1); sino->si_blkadj = 1; } if (debug) printf("op %d blk %jd ino %ju lbn %jd frags %d isat %d (%d)\n", brec->jb_op, blk, (uintmax_t)brec->jb_ino, brec->jb_lbn, brec->jb_frags, isat, frags); /* * If we found the block at this address we still have to * determine if we need to free the tail end that was * added by adding contiguous fragments from the same block. */ if (isat == 1) { if (frags == brec->jb_frags) continue; mask = blk_freemask(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags); mask >>= frags; blk += frags; frags = brec->jb_frags - frags; blk_free(blk, mask, frags); continue; } /* * The block wasn't found, attempt to free it. It won't be * freed if it was actually reallocated. If this was an * allocation we don't want to follow indirects as they * may not be written yet. Any children of the indirect will * have their own records. If it's a free we need to * recursively free children. 
*/ blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags, brec->jb_op == JOP_FREEBLK); } } /* * Walk the list of inode records for this cg and resolve moved and duplicate * inode references now that we have a complete picture. */ static void cg_build(struct suj_cg *sc) { struct suj_ino *sino; int i; for (i = 0; i < SUJ_HASHSIZE; i++) LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) ino_build(sino); } /* * Handle inodes requiring truncation. This must be done prior to * looking up any inodes in directories. */ static void cg_trunc(struct suj_cg *sc) { struct suj_ino *sino; int i; for (i = 0; i < SUJ_HASHSIZE; i++) { LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) { if (sino->si_trunc) { ino_trunc(sino->si_ino, sino->si_trunc->jt_size); sino->si_blkadj = 0; sino->si_trunc = NULL; } if (sino->si_blkadj) ino_adjblks(sino); } } } static void cg_adj_blk(struct suj_cg *sc) { struct suj_ino *sino; int i; for (i = 0; i < SUJ_HASHSIZE; i++) { LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) { if (sino->si_blkadj) ino_adjblks(sino); } } } /* * Free any partially allocated blocks and then resolve inode block * counts. */ static void cg_check_blk(struct suj_cg *sc) { struct suj_blk *sblk; int i; for (i = 0; i < SUJ_HASHSIZE; i++) LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next) blk_check(sblk); } /* * Walk the list of inode records for this cg, recovering any * changes which were not complete at the time of crash. */ static void cg_check_ino(struct suj_cg *sc) { struct suj_ino *sino; int i; for (i = 0; i < SUJ_HASHSIZE; i++) LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) ino_check(sino); } /* * Write a potentially dirty cg. Recalculate the summary information and * update the superblock summary. */ static void cg_write(struct suj_cg *sc) { ufs1_daddr_t fragno, cgbno, maxbno; u_int8_t *blksfree; struct cg *cgp; int blk; int i; if (sc->sc_dirty == 0) return; /* * Fix the frag and cluster summary. */ cgp = sc->sc_cgp; cgp->cg_cs.cs_nbfree = 0; cgp->cg_cs.cs_nffree = 0; bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum)); maxbno = fragstoblks(fs, fs->fs_fpg); if (fs->fs_contigsumsize > 0) { for (i = 1; i <= fs->fs_contigsumsize; i++) cg_clustersum(cgp)[i] = 0; bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT)); } blksfree = cg_blksfree(cgp); for (cgbno = 0; cgbno < maxbno; cgbno++) { if (ffs_isfreeblock(fs, blksfree, cgbno)) continue; if (ffs_isblock(fs, blksfree, cgbno)) { ffs_clusteracct(fs, cgp, cgbno, 1); cgp->cg_cs.cs_nbfree++; continue; } fragno = blkstofrags(fs, cgbno); blk = blkmap(fs, blksfree, fragno); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); for (i = 0; i < fs->fs_frag; i++) if (isset(blksfree, fragno + i)) cgp->cg_cs.cs_nffree++; } /* * Update the superblock cg summary from our now correct values * before writing the block. */ fs->fs_cs(fs, sc->sc_cgx) = cgp->cg_cs; if (bwrite(disk, fsbtodb(fs, cgtod(fs, sc->sc_cgx)), sc->sc_cgbuf, fs->fs_bsize) == -1) err_suj("Unable to write cylinder group %d\n", sc->sc_cgx); } /* * Write out any modified inodes. */ static void cg_write_inos(struct suj_cg *sc) { struct ino_blk *iblk; int i; for (i = 0; i < SUJ_HASHSIZE; i++) LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next) if (iblk->ib_dirty) iblk_write(iblk); } static void cg_apply(void (*apply)(struct suj_cg *)) { struct suj_cg *scg; int i; for (i = 0; i < SUJ_HASHSIZE; i++) LIST_FOREACH(scg, &cghash[i], sc_next) apply(scg); } /* * Process the unlinked but referenced file list. Freeing all inodes. 
*/ static void ino_unlinked(void) { union dinode *ip; uint16_t mode; ino_t inon; ino_t ino; ino = fs->fs_sujfree; fs->fs_sujfree = 0; while (ino != 0) { ip = ino_read(ino); mode = DIP(ip, di_mode) & IFMT; inon = DIP(ip, di_freelink); DIP_SET(ip, di_freelink, 0); /* * XXX Should this be an errx? */ if (DIP(ip, di_nlink) == 0) { if (debug) printf("Freeing unlinked ino %ju mode %o\n", (uintmax_t)ino, mode); ino_reclaim(ip, ino, mode); } else if (debug) printf("Skipping ino %ju mode %o with link %d\n", (uintmax_t)ino, mode, DIP(ip, di_nlink)); ino = inon; } } /* * Append a new record to the list of records requiring processing. */ static void ino_append(union jrec *rec) { struct jrefrec *refrec; struct jmvrec *mvrec; struct suj_ino *sino; struct suj_rec *srec; mvrec = &rec->rec_jmvrec; refrec = &rec->rec_jrefrec; if (debug && mvrec->jm_op == JOP_MVREF) printf("ino move: ino %ju, parent %ju, " "diroff %jd, oldoff %jd\n", (uintmax_t)mvrec->jm_ino, (uintmax_t)mvrec->jm_parent, (uintmax_t)mvrec->jm_newoff, (uintmax_t)mvrec->jm_oldoff); else if (debug && (refrec->jr_op == JOP_ADDREF || refrec->jr_op == JOP_REMREF)) printf("ino ref: op %d, ino %ju, nlink %ju, " "parent %ju, diroff %jd\n", refrec->jr_op, (uintmax_t)refrec->jr_ino, (uintmax_t)refrec->jr_nlink, (uintmax_t)refrec->jr_parent, (uintmax_t)refrec->jr_diroff); sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1); sino->si_hasrecs = 1; srec = errmalloc(sizeof(*srec)); srec->sr_rec = rec; TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next); } /* * Add a reference adjustment to the sino list and eliminate dups. The * primary loop in ino_build_ref() checks for dups but new ones may be * created as a result of offset adjustments. */ static void ino_add_ref(struct suj_ino *sino, struct suj_rec *srec) { struct jrefrec *refrec; struct suj_rec *srn; struct jrefrec *rrn; refrec = (struct jrefrec *)srec->sr_rec; /* * We walk backwards so that the oldest link count is preserved. If * an add record conflicts with a remove keep the remove. Redundant * removes are eliminated in ino_build_ref. Otherwise we keep the * oldest record at a given location. */ for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn; srn = TAILQ_PREV(srn, srechd, sr_next)) { rrn = (struct jrefrec *)srn->sr_rec; if (rrn->jr_parent != refrec->jr_parent || rrn->jr_diroff != refrec->jr_diroff) continue; if (rrn->jr_op == JOP_REMREF || refrec->jr_op == JOP_ADDREF) { rrn->jr_mode = refrec->jr_mode; return; } /* * Adding a remove. * * Replace the record in place with the old nlink in case * we replace the head of the list. Abandon srec as a dup. */ refrec->jr_nlink = rrn->jr_nlink; srn->sr_rec = srec->sr_rec; return; } TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next); } /* * Create a duplicate of a reference at a previous location. */ static void ino_dup_ref(struct suj_ino *sino, struct jrefrec *refrec, off_t diroff) { struct jrefrec *rrn; struct suj_rec *srn; rrn = errmalloc(sizeof(*refrec)); *rrn = *refrec; rrn->jr_op = JOP_ADDREF; rrn->jr_diroff = diroff; srn = errmalloc(sizeof(*srn)); srn->sr_rec = (union jrec *)rrn; ino_add_ref(sino, srn); } /* * Add a reference to the list at all known locations. We follow the offset * changes for a single instance and create duplicate add refs at each so * that we can tolerate any version of the directory block. Eliminate * removes which collide with adds that are seen in the journal. They should * not adjust the link count down. 
*/ static void ino_build_ref(struct suj_ino *sino, struct suj_rec *srec) { struct jrefrec *refrec; struct jmvrec *mvrec; struct suj_rec *srp; struct suj_rec *srn; struct jrefrec *rrn; off_t diroff; refrec = (struct jrefrec *)srec->sr_rec; /* * Search for a mvrec that matches this offset. Whether it's an add * or a remove we can delete the mvref after creating a dup record in * the old location. */ if (!TAILQ_EMPTY(&sino->si_movs)) { diroff = refrec->jr_diroff; for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; srn = srp) { srp = TAILQ_PREV(srn, srechd, sr_next); mvrec = (struct jmvrec *)srn->sr_rec; if (mvrec->jm_parent != refrec->jr_parent || mvrec->jm_newoff != diroff) continue; diroff = mvrec->jm_oldoff; TAILQ_REMOVE(&sino->si_movs, srn, sr_next); free(srn); ino_dup_ref(sino, refrec, diroff); } } /* * If a remove wasn't eliminated by an earlier add just append it to * the list. */ if (refrec->jr_op == JOP_REMREF) { ino_add_ref(sino, srec); return; } /* * Walk the list of records waiting to be added to the list. We * must check for moves that apply to our current offset and remove * them from the list. Remove any duplicates to eliminate removes * with corresponding adds. */ TAILQ_FOREACH_SAFE(srn, &sino->si_newrecs, sr_next, srp) { switch (srn->sr_rec->rec_jrefrec.jr_op) { case JOP_ADDREF: /* * This should actually be an error we should * have a remove for every add journaled. */ rrn = (struct jrefrec *)srn->sr_rec; if (rrn->jr_parent != refrec->jr_parent || rrn->jr_diroff != refrec->jr_diroff) break; TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); break; case JOP_REMREF: /* * Once we remove the current iteration of the * record at this address we're done. */ rrn = (struct jrefrec *)srn->sr_rec; if (rrn->jr_parent != refrec->jr_parent || rrn->jr_diroff != refrec->jr_diroff) break; TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); ino_add_ref(sino, srec); return; case JOP_MVREF: /* * Update our diroff based on any moves that match * and remove the move. */ mvrec = (struct jmvrec *)srn->sr_rec; if (mvrec->jm_parent != refrec->jr_parent || mvrec->jm_oldoff != refrec->jr_diroff) break; ino_dup_ref(sino, refrec, mvrec->jm_oldoff); refrec->jr_diroff = mvrec->jm_newoff; TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); break; default: err_suj("ino_build_ref: Unknown op %d\n", srn->sr_rec->rec_jrefrec.jr_op); } } ino_add_ref(sino, srec); } /* * Walk the list of new records and add them in-order resolving any * dups and adjusted offsets. */ static void ino_build(struct suj_ino *sino) { struct suj_rec *srec; while ((srec = TAILQ_FIRST(&sino->si_newrecs)) != NULL) { TAILQ_REMOVE(&sino->si_newrecs, srec, sr_next); switch (srec->sr_rec->rec_jrefrec.jr_op) { case JOP_ADDREF: case JOP_REMREF: ino_build_ref(sino, srec); break; case JOP_MVREF: /* * Add this mvrec to the queue of pending mvs. */ TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next); break; default: err_suj("ino_build: Unknown op %d\n", srec->sr_rec->rec_jrefrec.jr_op); } } if (TAILQ_EMPTY(&sino->si_recs)) sino->si_hasrecs = 0; } /* * Modify journal records so they refer to the base block number * and a start and end frag range. This is to facilitate the discovery * of overlapping fragment allocations. 
*/ static void blk_build(struct jblkrec *blkrec) { struct suj_rec *srec; struct suj_blk *sblk; struct jblkrec *blkrn; ufs2_daddr_t blk; int frag; if (debug) printf("blk_build: op %d blkno %jd frags %d oldfrags %d " "ino %ju lbn %jd\n", blkrec->jb_op, (uintmax_t)blkrec->jb_blkno, blkrec->jb_frags, blkrec->jb_oldfrags, (uintmax_t)blkrec->jb_ino, (uintmax_t)blkrec->jb_lbn); blk = blknum(fs, blkrec->jb_blkno); frag = fragnum(fs, blkrec->jb_blkno); sblk = blk_lookup(blk, 1); /* * Rewrite the record using oldfrags to indicate the offset into * the block. Leave jb_frags as the actual allocated count. */ blkrec->jb_blkno -= frag; blkrec->jb_oldfrags = frag; if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag) err_suj("Invalid fragment count %d oldfrags %d\n", blkrec->jb_frags, frag); /* * Detect dups. If we detect a dup we always discard the oldest * record as it is superseded by the new record. This speeds up * later stages but also eliminates free records which are used * to indicate that the contents of indirects can be trusted. */ TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { blkrn = (struct jblkrec *)srec->sr_rec; if (blkrn->jb_ino != blkrec->jb_ino || blkrn->jb_lbn != blkrec->jb_lbn || blkrn->jb_blkno != blkrec->jb_blkno || blkrn->jb_frags != blkrec->jb_frags || blkrn->jb_oldfrags != blkrec->jb_oldfrags) continue; if (debug) printf("Removed dup.\n"); /* Discard the free which is a dup with an alloc. */ if (blkrec->jb_op == JOP_FREEBLK) return; TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next); free(srec); break; } srec = errmalloc(sizeof(*srec)); srec->sr_rec = (union jrec *)blkrec; TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next); } static void ino_build_trunc(struct jtrncrec *rec) { struct suj_ino *sino; if (debug) printf("ino_build_trunc: op %d ino %ju, size %jd\n", rec->jt_op, (uintmax_t)rec->jt_ino, (uintmax_t)rec->jt_size); sino = ino_lookup(rec->jt_ino, 1); if (rec->jt_op == JOP_SYNC) { sino->si_trunc = NULL; return; } if (sino->si_trunc == NULL || sino->si_trunc->jt_size > rec->jt_size) sino->si_trunc = rec; } /* * Build up tables of the operations we need to recover. */ static void suj_build(void) { struct suj_seg *seg; union jrec *rec; int off; int i; TAILQ_FOREACH(seg, &allsegs, ss_next) { if (debug) printf("seg %jd has %d records, oldseq %jd.\n", seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt, seg->ss_rec.jsr_oldest); off = 0; rec = (union jrec *)seg->ss_blk; for (i = 0; i < seg->ss_rec.jsr_cnt; off += JREC_SIZE, rec++) { /* skip the segrec. */ if ((off % real_dev_bsize) == 0) continue; switch (rec->rec_jrefrec.jr_op) { case JOP_ADDREF: case JOP_REMREF: case JOP_MVREF: ino_append(rec); break; case JOP_NEWBLK: case JOP_FREEBLK: blk_build((struct jblkrec *)rec); break; case JOP_TRUNC: case JOP_SYNC: ino_build_trunc((struct jtrncrec *)rec); break; default: err_suj("Unknown journal operation %d (%d)\n", rec->rec_jrefrec.jr_op, off); } i++; } } } /* * Prune the journal segments to those we care about based on the * oldest sequence in the newest segment. Order the segment list * based on sequence number. */ static void suj_prune(void) { struct suj_seg *seg; struct suj_seg *segn; uint64_t newseq; int discard; if (debug) printf("Pruning up to %jd\n", oldseq); /* First free the expired segments. */ TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { if (seg->ss_rec.jsr_seq >= oldseq) continue; TAILQ_REMOVE(&allsegs, seg, ss_next); free(seg->ss_blk); free(seg); } /* Next ensure that segments are ordered properly. 
*/ seg = TAILQ_FIRST(&allsegs); if (seg == NULL) { if (debug) printf("Empty journal\n"); return; } newseq = seg->ss_rec.jsr_seq; for (;;) { seg = TAILQ_LAST(&allsegs, seghd); if (seg->ss_rec.jsr_seq >= newseq) break; TAILQ_REMOVE(&allsegs, seg, ss_next); TAILQ_INSERT_HEAD(&allsegs, seg, ss_next); newseq = seg->ss_rec.jsr_seq; } if (newseq != oldseq) { TAILQ_FOREACH(seg, &allsegs, ss_next) { printf("%jd, ", seg->ss_rec.jsr_seq); } printf("\n"); err_suj("Journal file sequence mismatch %jd != %jd\n", newseq, oldseq); } /* * The kernel may asynchronously write segments which can create * gaps in the sequence space. Throw away any segments after the * gap as the kernel guarantees only those that are contiguously * reachable are marked as completed. */ discard = 0; TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { if (!discard && newseq++ == seg->ss_rec.jsr_seq) { jrecs += seg->ss_rec.jsr_cnt; jbytes += seg->ss_rec.jsr_blocks * real_dev_bsize; continue; } discard = 1; if (debug) printf("Journal order mismatch %jd != %jd pruning\n", newseq-1, seg->ss_rec.jsr_seq); TAILQ_REMOVE(&allsegs, seg, ss_next); free(seg->ss_blk); free(seg); } if (debug) printf("Processing journal segments from %jd to %jd\n", oldseq, newseq-1); } /* * Verify the journal inode before attempting to read records. */ static int suj_verifyino(union dinode *ip) { if (DIP(ip, di_nlink) != 1) { printf("Invalid link count %d for journal inode %ju\n", DIP(ip, di_nlink), (uintmax_t)sujino); return (-1); } if ((DIP(ip, di_flags) & (SF_IMMUTABLE | SF_NOUNLINK)) != (SF_IMMUTABLE | SF_NOUNLINK)) { printf("Invalid flags 0x%X for journal inode %ju\n", DIP(ip, di_flags), (uintmax_t)sujino); return (-1); } if (DIP(ip, di_mode) != (IFREG | IREAD)) { printf("Invalid mode %o for journal inode %ju\n", DIP(ip, di_mode), (uintmax_t)sujino); return (-1); } if (DIP(ip, di_size) < SUJ_MIN) { printf("Invalid size %jd for journal inode %ju\n", DIP(ip, di_size), (uintmax_t)sujino); return (-1); } if (DIP(ip, di_modrev) != fs->fs_mtime) { printf("Journal timestamp does not match fs mount time\n"); return (-1); } return (0); } struct jblocks { struct jextent *jb_extent; /* Extent array. */ int jb_avail; /* Available extents. */ int jb_used; /* Last used extent. */ int jb_head; /* Allocator head. */ int jb_off; /* Allocator extent offset. */ }; struct jextent { ufs2_daddr_t je_daddr; /* Disk block address. */ int je_blocks; /* Disk block count. */ }; static struct jblocks *suj_jblocks; static struct jblocks * jblocks_create(void) { struct jblocks *jblocks; int size; jblocks = errmalloc(sizeof(*jblocks)); jblocks->jb_avail = 10; jblocks->jb_used = 0; jblocks->jb_head = 0; jblocks->jb_off = 0; size = sizeof(struct jextent) * jblocks->jb_avail; jblocks->jb_extent = errmalloc(size); bzero(jblocks->jb_extent, size); return (jblocks); } /* * Return the next available disk block and the amount of contiguous * free space it contains. 
 */
static ufs2_daddr_t
jblocks_next(struct jblocks *jblocks, int bytes, int *actual)
{
	struct jextent *jext;
	ufs2_daddr_t daddr;
	int freecnt;
	int blocks;

	blocks = bytes / disk->d_bsize;
	jext = &jblocks->jb_extent[jblocks->jb_head];
	freecnt = jext->je_blocks - jblocks->jb_off;
	if (freecnt == 0) {
		jblocks->jb_off = 0;
		if (++jblocks->jb_head > jblocks->jb_used)
			return (0);
		jext = &jblocks->jb_extent[jblocks->jb_head];
		freecnt = jext->je_blocks;
	}
	if (freecnt > blocks)
		freecnt = blocks;
	*actual = freecnt * disk->d_bsize;
	daddr = jext->je_daddr + jblocks->jb_off;

	return (daddr);
}

/*
 * Advance the allocation head by a specified number of bytes, consuming
 * one journal segment.
 */
static void
jblocks_advance(struct jblocks *jblocks, int bytes)
{

	jblocks->jb_off += bytes / disk->d_bsize;
}

static void
jblocks_destroy(struct jblocks *jblocks)
{

	free(jblocks->jb_extent);
	free(jblocks);
}

static void
jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks)
{
	struct jextent *jext;
	int size;

	jext = &jblocks->jb_extent[jblocks->jb_used];
	/* Adding the first block. */
	if (jext->je_daddr == 0) {
		jext->je_daddr = daddr;
		jext->je_blocks = blocks;
		return;
	}
	/* Extending the last extent. */
	if (jext->je_daddr + jext->je_blocks == daddr) {
		jext->je_blocks += blocks;
		return;
	}
	/* Adding a new extent. */
	if (++jblocks->jb_used == jblocks->jb_avail) {
		jblocks->jb_avail *= 2;
		size = sizeof(struct jextent) * jblocks->jb_avail;
		jext = errmalloc(size);
		bzero(jext, size);
		bcopy(jblocks->jb_extent, jext,
		    sizeof(struct jextent) * jblocks->jb_used);
		free(jblocks->jb_extent);
		jblocks->jb_extent = jext;
	}
	jext = &jblocks->jb_extent[jblocks->jb_used];
	jext->je_daddr = daddr;
	jext->je_blocks = blocks;

	return;
}

/*
 * Add a file block from the journal to the extent map.  We can't read
 * each file block individually because the kernel treats it as a circular
 * buffer and segments may span multiple contiguous blocks.
 */
static void
suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
{

	jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags));
}

static void
suj_read(void)
{
	uint8_t block[1 * 1024 * 1024];
	struct suj_seg *seg;
	struct jsegrec *recn;
	struct jsegrec *rec;
	ufs2_daddr_t blk;
	int readsize;
	int blocks;
	int recsize;
	int size;
	int i;

	/*
	 * Read records until we exhaust the journal space.  If we find
	 * an invalid record we start searching for a valid segment header
	 * at the next block.  This is because we don't have a head/tail
	 * pointer and must recover the information indirectly.  At the gap
	 * between the head and tail we won't necessarily have a valid
	 * segment.
	 */
restart:
	for (;;) {
		size = sizeof(block);
		blk = jblocks_next(suj_jblocks, size, &readsize);
		if (blk == 0)
			return;
		size = readsize;
		/*
		 * Read 1MB at a time and scan for records within this block.
		 */
		if (bread(disk, blk, &block, size) == -1) {
			err_suj("Error reading journal block %jd\n",
			    (intmax_t)blk);
		}
		for (rec = (void *)block; size; size -= recsize,
		    rec = (struct jsegrec *)((uintptr_t)rec + recsize)) {
			recsize = real_dev_bsize;
			if (rec->jsr_time != fs->fs_mtime) {
				if (debug)
					printf("Rec time %jd != fs mtime %jd\n",
					    rec->jsr_time, fs->fs_mtime);
				jblocks_advance(suj_jblocks, recsize);
				continue;
			}
			if (rec->jsr_cnt == 0) {
				if (debug)
					printf("Found illegal count %d\n",
					    rec->jsr_cnt);
				jblocks_advance(suj_jblocks, recsize);
				continue;
			}
			blocks = rec->jsr_blocks;
			recsize = blocks * real_dev_bsize;
			if (recsize > size) {
				/*
				 * We may just have run out of buffer, restart
				 * the loop to re-read from this spot.
*/ if (size < fs->fs_bsize && size != readsize && recsize <= fs->fs_bsize) goto restart; if (debug) printf("Found invalid segsize %d > %d\n", recsize, size); recsize = real_dev_bsize; jblocks_advance(suj_jblocks, recsize); continue; } /* * Verify that all blocks in the segment are present. */ for (i = 1; i < blocks; i++) { recn = (void *)((uintptr_t)rec) + i * real_dev_bsize; if (recn->jsr_seq == rec->jsr_seq && recn->jsr_time == rec->jsr_time) continue; if (debug) printf("Incomplete record %jd (%d)\n", rec->jsr_seq, i); recsize = i * real_dev_bsize; jblocks_advance(suj_jblocks, recsize); goto restart; } seg = errmalloc(sizeof(*seg)); seg->ss_blk = errmalloc(recsize); seg->ss_rec = *rec; bcopy((void *)rec, seg->ss_blk, recsize); if (rec->jsr_oldest > oldseq) oldseq = rec->jsr_oldest; TAILQ_INSERT_TAIL(&allsegs, seg, ss_next); jblocks_advance(suj_jblocks, recsize); } } } /* * Search a directory block for the SUJ_FILE. */ static void suj_find(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) { char block[MAXBSIZE]; struct direct *dp; int bytes; int off; if (sujino) return; bytes = lfragtosize(fs, frags); if (bread(disk, fsbtodb(fs, blk), block, bytes) <= 0) - err_suj("Failed to read ROOTINO directory block %jd\n", blk); + err_suj("Failed to read UFS_ROOTINO directory block %jd\n", + blk); for (off = 0; off < bytes; off += dp->d_reclen) { dp = (struct direct *)&block[off]; if (dp->d_reclen == 0) break; if (dp->d_ino == 0) continue; if (dp->d_namlen != strlen(SUJ_FILE)) continue; if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0) continue; sujino = dp->d_ino; return; } } /* * Orchestrate the verification of a filesystem via the softupdates journal. */ int suj_check(const char *filesys) { union dinode *jip; union dinode *ip; uint64_t blocks; int retval; struct suj_seg *seg; struct suj_seg *segn; initsuj(); opendisk(filesys); /* * Set an exit point when SUJ check failed */ retval = setjmp(jmpbuf); if (retval != 0) { pwarn("UNEXPECTED SU+J INCONSISTENCY\n"); TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { TAILQ_REMOVE(&allsegs, seg, ss_next); free(seg->ss_blk); free(seg); } if (reply("FALLBACK TO FULL FSCK") == 0) { ckfini(0); exit(EEXIT); } else return (-1); } /* * Find the journal inode. */ - ip = ino_read(ROOTINO); + ip = ino_read(UFS_ROOTINO); sujino = 0; - ino_visit(ip, ROOTINO, suj_find, 0); + ino_visit(ip, UFS_ROOTINO, suj_find, 0); if (sujino == 0) { printf("Journal inode removed. Use tunefs to re-create.\n"); sblock.fs_flags &= ~FS_SUJ; sblock.fs_sujfree = 0; return (-1); } /* * Fetch the journal inode and verify it. */ jip = ino_read(sujino); printf("** SU+J Recovering %s\n", filesys); if (suj_verifyino(jip) != 0) return (-1); /* * Build a list of journal blocks in jblocks before parsing the * available journal blocks in with suj_read(). 
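 * A sparse journal cannot be replayed, so an inode whose allocated
 * frags do not cover its full size is rejected below.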
*/ printf("** Reading %jd byte journal from inode %ju.\n", DIP(jip, di_size), (uintmax_t)sujino); suj_jblocks = jblocks_create(); blocks = ino_visit(jip, sujino, suj_add_block, 0); if (blocks != numfrags(fs, DIP(jip, di_size))) { printf("Sparse journal inode %ju.\n", (uintmax_t)sujino); return (-1); } suj_read(); jblocks_destroy(suj_jblocks); suj_jblocks = NULL; if (preen || reply("RECOVER")) { printf("** Building recovery table.\n"); suj_prune(); suj_build(); cg_apply(cg_build); printf("** Resolving unreferenced inode list.\n"); ino_unlinked(); printf("** Processing journal entries.\n"); cg_apply(cg_trunc); cg_apply(cg_check_blk); cg_apply(cg_adj_blk); cg_apply(cg_check_ino); } if (preen == 0 && (jrecs > 0 || jbytes > 0) && reply("WRITE CHANGES") == 0) return (0); /* * To remain idempotent with partial truncations the free bitmaps * must be written followed by indirect blocks and lastly inode * blocks. This preserves access to the modified pointers until * they are freed. */ cg_apply(cg_write); dblk_write(); cg_apply(cg_write_inos); /* Write back superblock. */ closedisk(filesys); if (jrecs > 0 || jbytes > 0) { printf("** %jd journal records in %jd bytes for %.2f%% utilization\n", jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100); printf("** Freed %jd inodes (%jd dirs) %jd blocks, and %jd frags.\n", freeinos, freedir, freeblocks, freefrags); } return (0); } static void initsuj(void) { int i; for (i = 0; i < SUJ_HASHSIZE; i++) { LIST_INIT(&cghash[i]); LIST_INIT(&dbhash[i]); } lastcg = NULL; lastblk = NULL; TAILQ_INIT(&allsegs); oldseq = 0; disk = NULL; fs = NULL; sujino = 0; freefrags = 0; freeblocks = 0; freeinos = 0; freedir = 0; jbytes = 0; jrecs = 0; suj_jblocks = NULL; } Index: head/sbin/fsdb/fsdb.c =================================================================== --- head/sbin/fsdb/fsdb.c (revision 313779) +++ head/sbin/fsdb/fsdb.c (revision 313780) @@ -1,1210 +1,1210 @@ /* $NetBSD: fsdb.c,v 1.2 1995/10/08 23:18:10 thorpej Exp $ */ /* * Copyright (c) 1995 John T. Kohl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef lint static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsdb.h" #include "fsck.h" static void usage(void) __dead2; int cmdloop(void); static int compare_blk32(uint32_t *wantedblk, uint32_t curblk); static int compare_blk64(uint64_t *wantedblk, uint64_t curblk); static int founddatablk(uint64_t blk); static int find_blks32(uint32_t *buf, int size, uint32_t *blknum); static int find_blks64(uint64_t *buf, int size, uint64_t *blknum); static int find_indirblks32(uint32_t blk, int ind_level, uint32_t *blknum); static int find_indirblks64(uint64_t blk, int ind_level, uint64_t *blknum); static void usage(void) { fprintf(stderr, "usage: fsdb [-d] [-f] [-r] fsname\n"); exit(1); } int returntosingle; char nflag; /* * We suck in lots of fsck code, and just pick & choose the stuff we want. * * fsreadfd is set up to read from the file system, fswritefd to write to * the file system. */ int main(int argc, char *argv[]) { int ch, rval; char *fsys = NULL; while (-1 != (ch = getopt(argc, argv, "fdr"))) { switch (ch) { case 'f': /* The -f option is left for historical * reasons and has no meaning. */ break; case 'd': debug++; break; case 'r': nflag++; /* "no" in fsck, readonly for us */ break; default: usage(); } } argc -= optind; argv += optind; if (argc != 1) usage(); else fsys = argv[0]; sblock_init(); if (!setup(fsys)) errx(1, "cannot set up file system `%s'", fsys); printf("%s file system `%s'\nLast Mounted on %s\n", nflag? "Examining": "Editing", fsys, sblock.fs_fsmnt); rval = cmdloop(); if (!nflag) { sblock.fs_clean = 0; /* mark it dirty */ sbdirty(); ckfini(0); printf("*** FILE SYSTEM MARKED DIRTY\n"); printf("*** BE SURE TO RUN FSCK TO CLEAN UP ANY DAMAGE\n"); printf("*** IF IT WAS MOUNTED, RE-MOUNT WITH -u -o reload\n"); } exit(rval); } #define CMDFUNC(func) int func(int argc, char *argv[]) #define CMDFUNCSTART(func) int func(int argc, char *argv[]) CMDFUNC(helpfn); CMDFUNC(focus); /* focus on inode */ CMDFUNC(active); /* print active inode */ CMDFUNC(blocks); /* print blocks for active inode */ CMDFUNC(focusname); /* focus by name */ CMDFUNC(zapi); /* clear inode */ CMDFUNC(uplink); /* incr link */ CMDFUNC(downlink); /* decr link */ CMDFUNC(linkcount); /* set link count */ CMDFUNC(quit); /* quit */ CMDFUNC(findblk); /* find block */ CMDFUNC(ls); /* list directory */ CMDFUNC(rm); /* remove name */ CMDFUNC(ln); /* add name */ CMDFUNC(newtype); /* change type */ CMDFUNC(chmode); /* change mode */ CMDFUNC(chlen); /* change length */ CMDFUNC(chaflags); /* change flags */ CMDFUNC(chgen); /* change generation */ CMDFUNC(chowner); /* change owner */ CMDFUNC(chgroup); /* Change group */ CMDFUNC(back); /* pop back to last ino */ CMDFUNC(chbtime); /* Change btime */ CMDFUNC(chmtime); /* Change mtime */ CMDFUNC(chctime); /* Change ctime */ CMDFUNC(chatime); /* Change atime */ CMDFUNC(chinum); /* Change inode # of dirent */ CMDFUNC(chname); /* Change dirname of dirent */ struct cmdtable cmds[] = { { "help", "Print out help", 1, 1, FL_RO, helpfn }, { "?", "Print out help", 1, 1, FL_RO, helpfn }, { "inode", "Set active inode to INUM", 2, 2, FL_RO, focus }, { "clri", "Clear inode INUM", 2, 2, FL_WR, zapi }, { "lookup", "Set active inode by looking up NAME", 2, 2, FL_RO | FL_ST, focusname }, { "cd", "Set active inode by looking up NAME", 2, 2, FL_RO | FL_ST, focusname }, { "back", "Go to previous active inode", 1, 1, FL_RO, back }, { "active", "Print active inode", 
1, 1, FL_RO, active }, { "print", "Print active inode", 1, 1, FL_RO, active }, { "blocks", "Print block numbers of active inode", 1, 1, FL_RO, blocks }, { "uplink", "Increment link count", 1, 1, FL_WR, uplink }, { "downlink", "Decrement link count", 1, 1, FL_WR, downlink }, { "linkcount", "Set link count to COUNT", 2, 2, FL_WR, linkcount }, { "findblk", "Find inode owning disk block(s)", 2, 33, FL_RO, findblk}, { "ls", "List current inode as directory", 1, 1, FL_RO, ls }, { "rm", "Remove NAME from current inode directory", 2, 2, FL_WR | FL_ST, rm }, { "del", "Remove NAME from current inode directory", 2, 2, FL_WR | FL_ST, rm }, { "ln", "Hardlink INO into current inode directory as NAME", 3, 3, FL_WR | FL_ST, ln }, { "chinum", "Change dir entry number INDEX to INUM", 3, 3, FL_WR, chinum }, { "chname", "Change dir entry number INDEX to NAME", 3, 3, FL_WR | FL_ST, chname }, { "chtype", "Change type of current inode to TYPE", 2, 2, FL_WR, newtype }, { "chmod", "Change mode of current inode to MODE", 2, 2, FL_WR, chmode }, { "chlen", "Change length of current inode to LENGTH", 2, 2, FL_WR, chlen }, { "chown", "Change owner of current inode to OWNER", 2, 2, FL_WR, chowner }, { "chgrp", "Change group of current inode to GROUP", 2, 2, FL_WR, chgroup }, { "chflags", "Change flags of current inode to FLAGS", 2, 2, FL_WR, chaflags }, { "chgen", "Change generation number of current inode to GEN", 2, 2, FL_WR, chgen }, { "btime", "Change btime of current inode to BTIME", 2, 2, FL_WR, chbtime }, { "mtime", "Change mtime of current inode to MTIME", 2, 2, FL_WR, chmtime }, { "ctime", "Change ctime of current inode to CTIME", 2, 2, FL_WR, chctime }, { "atime", "Change atime of current inode to ATIME", 2, 2, FL_WR, chatime }, { "quit", "Exit", 1, 1, FL_RO, quit }, { "q", "Exit", 1, 1, FL_RO, quit }, { "exit", "Exit", 1, 1, FL_RO, quit }, { NULL, 0, 0, 0, 0, NULL }, }; int helpfn(int argc, char *argv[]) { struct cmdtable *cmdtp; printf("Commands are:\n%-10s %5s %5s %s\n", "command", "min args", "max args", "what"); for (cmdtp = cmds; cmdtp->cmd; cmdtp++) printf("%-10s %5u %5u %s\n", cmdtp->cmd, cmdtp->minargc-1, cmdtp->maxargc-1, cmdtp->helptxt); return 0; } char * prompt(EditLine *el) { static char pstring[64]; snprintf(pstring, sizeof(pstring), "fsdb (inum: %ju)> ", (uintmax_t)curinum); return pstring; } int cmdloop(void) { char *line; const char *elline; int cmd_argc, rval = 0, known; #define scratch known char **cmd_argv; struct cmdtable *cmdp; History *hist; EditLine *elptr; HistEvent he; - curinode = ginode(ROOTINO); - curinum = ROOTINO; + curinode = ginode(UFS_ROOTINO); + curinum = UFS_ROOTINO; printactive(0); hist = history_init(); history(hist, &he, H_SETSIZE, 100); /* 100 elt history buffer */ elptr = el_init("fsdb", stdin, stdout, stderr); el_set(elptr, EL_EDITOR, "emacs"); el_set(elptr, EL_PROMPT, prompt); el_set(elptr, EL_HIST, history, hist); el_source(elptr, NULL); while ((elline = el_gets(elptr, &scratch)) != NULL && scratch != 0) { if (debug) printf("command `%s'\n", elline); history(hist, &he, H_ENTER, elline); line = strdup(elline); cmd_argv = crack(line, &cmd_argc); /* * el_parse returns -1 to signal that it's not been handled * internally. 
*/ if (el_parse(elptr, cmd_argc, (const char **)cmd_argv) != -1) continue; if (cmd_argc) { known = 0; for (cmdp = cmds; cmdp->cmd; cmdp++) { if (!strcmp(cmdp->cmd, cmd_argv[0])) { if ((cmdp->flags & FL_WR) == FL_WR && nflag) warnx("`%s' requires write access", cmd_argv[0]), rval = 1; else if (cmd_argc >= cmdp->minargc && cmd_argc <= cmdp->maxargc) rval = (*cmdp->handler)(cmd_argc, cmd_argv); else if (cmd_argc >= cmdp->minargc && (cmdp->flags & FL_ST) == FL_ST) { strcpy(line, elline); cmd_argv = recrack(line, &cmd_argc, cmdp->maxargc); rval = (*cmdp->handler)(cmd_argc, cmd_argv); } else rval = argcount(cmdp, cmd_argc, cmd_argv); known = 1; break; } } if (!known) warnx("unknown command `%s'", cmd_argv[0]), rval = 1; } else rval = 0; free(line); if (rval < 0) /* user typed "quit" */ return 0; if (rval) warnx("rval was %d", rval); } el_end(elptr); history_end(hist); return rval; } union dinode *curinode; ino_t curinum, ocurrent; #define GETINUM(ac,inum) inum = strtoul(argv[ac], &cp, 0); \ - if (inum < ROOTINO || inum > maxino || cp == argv[ac] || *cp != '\0' ) { \ +if (inum < UFS_ROOTINO || inum > maxino || cp == argv[ac] || *cp != '\0' ) { \ printf("inode %ju out of range; range is [%ju,%ju]\n", \ - (uintmax_t)inum, (uintmax_t)ROOTINO, (uintmax_t)maxino); \ + (uintmax_t)inum, (uintmax_t)UFS_ROOTINO, (uintmax_t)maxino);\ return 1; \ - } +} /* * Focus on given inode number */ CMDFUNCSTART(focus) { ino_t inum; char *cp; GETINUM(1,inum); curinode = ginode(inum); ocurrent = curinum; curinum = inum; printactive(0); return 0; } CMDFUNCSTART(back) { curinum = ocurrent; curinode = ginode(curinum); printactive(0); return 0; } CMDFUNCSTART(zapi) { ino_t inum; union dinode *dp; char *cp; GETINUM(1,inum); dp = ginode(inum); clearinode(dp); inodirty(); if (curinode) /* re-set after potential change */ curinode = ginode(curinum); return 0; } CMDFUNCSTART(active) { printactive(0); return 0; } CMDFUNCSTART(blocks) { printactive(1); return 0; } CMDFUNCSTART(quit) { return -1; } CMDFUNCSTART(uplink) { if (!checkactive()) return 1; DIP_SET(curinode, di_nlink, DIP(curinode, di_nlink) + 1); printf("inode %ju link count now %d\n", (uintmax_t)curinum, DIP(curinode, di_nlink)); inodirty(); return 0; } CMDFUNCSTART(downlink) { if (!checkactive()) return 1; DIP_SET(curinode, di_nlink, DIP(curinode, di_nlink) - 1); printf("inode %ju link count now %d\n", (uintmax_t)curinum, DIP(curinode, di_nlink)); inodirty(); return 0; } const char *typename[] = { "unknown", "fifo", "char special", "unregistered #3", "directory", "unregistered #5", "blk special", "unregistered #7", "regular", "unregistered #9", "symlink", "unregistered #11", "socket", "unregistered #13", "whiteout", }; int diroff; int slot; int scannames(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; printf("slot %d off %d ino %d reclen %d: %s, `%.*s'\n", slot++, diroff, dirp->d_ino, dirp->d_reclen, typename[dirp->d_type], dirp->d_namlen, dirp->d_name); diroff += dirp->d_reclen; return (KEEPON); } CMDFUNCSTART(ls) { struct inodesc idesc; checkactivedir(); /* let it go on anyway */ slot = 0; diroff = 0; idesc.id_number = curinum; idesc.id_func = scannames; idesc.id_type = DATA; idesc.id_fix = IGNORE; ckinode(curinode, &idesc); curinode = ginode(curinum); return 0; } static int findblk_numtofind; static int wantedblksize; CMDFUNCSTART(findblk) { ino_t inum, inosused; uint32_t *wantedblk32; uint64_t *wantedblk64; struct bufarea *cgbp; struct cg *cgp; int c, i, is_ufs2; wantedblksize = (argc - 1); is_ufs2 = sblock.fs_magic == FS_UFS2_MAGIC; ocurrent = 
curinum;
	if (is_ufs2) {
		wantedblk64 = calloc(wantedblksize, sizeof(uint64_t));
		if (wantedblk64 == NULL)
			err(1, "malloc");
		for (i = 1; i < argc; i++)
			wantedblk64[i - 1] = dbtofsb(&sblock,
			    strtoull(argv[i], NULL, 0));
	} else {
		wantedblk32 = calloc(wantedblksize, sizeof(uint32_t));
		if (wantedblk32 == NULL)
			err(1, "malloc");
		for (i = 1; i < argc; i++)
			wantedblk32[i - 1] = dbtofsb(&sblock,
			    strtoull(argv[i], NULL, 0));
	}
	findblk_numtofind = wantedblksize;
	/*
	 * sblock.fs_ncg holds the number of cylinder groups.
	 * Iterate over all cylinder groups.
	 */
	for (c = 0; c < sblock.fs_ncg; c++) {
		/*
		 * sblock.fs_ipg holds the number of inodes per cylinder
		 * group.  Calculate the first inode number of a given
		 * cylinder group.
		 */
		inum = c * sblock.fs_ipg;
		/* Read cylinder group. */
		cgbp = cgget(c);
		cgp = cgbp->b_un.b_cg;
		/*
		 * Get the highest used inode number for a given cylinder
		 * group.  For UFS1 all inodes are initialized at the newfs
		 * stage.
		 */
		if (is_ufs2)
			inosused = cgp->cg_initediblk;
		else
			inosused = sblock.fs_ipg;
		for (; inosused > 0; inum++, inosused--) {
-			/* Skip magic inodes: 0, WINO, ROOTINO. */
-			if (inum < ROOTINO)
+			/* Skip magic inodes: 0, UFS_WINO, UFS_ROOTINO. */
+			if (inum < UFS_ROOTINO)
				continue;
			/*
			 * Check if the block we are looking for is just an
			 * inode block.
			 *
			 * ino_to_fsba() - get block containing inode from
			 * its number.
			 * INOPB() - get the number of inodes in one disk
			 * block.
			 */
			if (is_ufs2 ?
			    compare_blk64(wantedblk64,
				ino_to_fsba(&sblock, inum)) :
			    compare_blk32(wantedblk32,
				ino_to_fsba(&sblock, inum))) {
				printf("block %llu: inode block (%ju-%ju)\n",
				    (unsigned long long)fsbtodb(&sblock,
					ino_to_fsba(&sblock, inum)),
				    (uintmax_t)(inum / INOPB(&sblock)) *
					INOPB(&sblock),
				    (uintmax_t)(inum / INOPB(&sblock) + 1) *
					INOPB(&sblock));
				findblk_numtofind--;
				if (findblk_numtofind == 0)
					goto end;
			}
			/* Get the on-disk inode, aka dinode. */
			curinum = inum;
			curinode = ginode(inum);
			/* Find IFLNK dinode with allocated data blocks. */
			switch (DIP(curinode, di_mode) & IFMT) {
			case IFDIR:
			case IFREG:
				if (DIP(curinode, di_blocks) == 0)
					continue;
				break;
			case IFLNK:
			    {
				uint64_t size = DIP(curinode, di_size);
				if (size > 0 &&
				    size < sblock.fs_maxsymlinklen &&
				    DIP(curinode, di_blocks) == 0)
					continue;
				else
					break;
			    }
			default:
				continue;
			}
			/* Look through direct data blocks. */
			if (is_ufs2 ?
-			    find_blks64(curinode->dp2.di_db, NDADDR, wantedblk64) :
-			    find_blks32(curinode->dp1.di_db, NDADDR, wantedblk32))
+			    find_blks64(curinode->dp2.di_db, UFS_NDADDR, wantedblk64) :
+			    find_blks32(curinode->dp1.di_db, UFS_NDADDR, wantedblk32))
				goto end;
-			for (i = 0; i < NIADDR; i++) {
+			for (i = 0; i < UFS_NIADDR; i++) {
				/*
				 * Does the block we are looking for belong
				 * to the indirect blocks?
				 */
				if (is_ufs2 ?
				    compare_blk64(wantedblk64,
					curinode->dp2.di_ib[i]) :
				    compare_blk32(wantedblk32,
					curinode->dp1.di_ib[i]))
					if (founddatablk(is_ufs2 ?
					    curinode->dp2.di_ib[i] :
					    curinode->dp1.di_ib[i]))
						goto end;
				/*
				 * Search through indirect, double and triple
				 * indirect data blocks.
				 */
				if (is_ufs2 ?
				    (curinode->dp2.di_ib[i] != 0) :
				    (curinode->dp1.di_ib[i] != 0))
					if (is_ufs2 ?
find_indirblks64(curinode->dp2.di_ib[i], i, wantedblk64) : find_indirblks32(curinode->dp1.di_ib[i], i, wantedblk32)) goto end; } } } end: curinum = ocurrent; curinode = ginode(curinum); return 0; } static int compare_blk32(uint32_t *wantedblk, uint32_t curblk) { int i; for (i = 0; i < wantedblksize; i++) { if (wantedblk[i] != 0 && wantedblk[i] == curblk) { wantedblk[i] = 0; return 1; } } return 0; } static int compare_blk64(uint64_t *wantedblk, uint64_t curblk) { int i; for (i = 0; i < wantedblksize; i++) { if (wantedblk[i] != 0 && wantedblk[i] == curblk) { wantedblk[i] = 0; return 1; } } return 0; } static int founddatablk(uint64_t blk) { printf("%llu: data block of inode %ju\n", (unsigned long long)fsbtodb(&sblock, blk), (uintmax_t)curinum); findblk_numtofind--; if (findblk_numtofind == 0) return 1; return 0; } static int find_blks32(uint32_t *buf, int size, uint32_t *wantedblk) { int blk; for (blk = 0; blk < size; blk++) { if (buf[blk] == 0) continue; if (compare_blk32(wantedblk, buf[blk])) { if (founddatablk(buf[blk])) return 1; } } return 0; } static int find_indirblks32(uint32_t blk, int ind_level, uint32_t *wantedblk) { #define MAXNINDIR (MAXBSIZE / sizeof(uint32_t)) uint32_t idblk[MAXNINDIR]; int i; blread(fsreadfd, (char *)idblk, fsbtodb(&sblock, blk), (int)sblock.fs_bsize); if (ind_level <= 0) { if (find_blks32(idblk, sblock.fs_bsize / sizeof(uint32_t), wantedblk)) return 1; } else { ind_level--; for (i = 0; i < sblock.fs_bsize / sizeof(uint32_t); i++) { if (compare_blk32(wantedblk, idblk[i])) { if (founddatablk(idblk[i])) return 1; } if (idblk[i] != 0) if (find_indirblks32(idblk[i], ind_level, wantedblk)) return 1; } } #undef MAXNINDIR return 0; } static int find_blks64(uint64_t *buf, int size, uint64_t *wantedblk) { int blk; for (blk = 0; blk < size; blk++) { if (buf[blk] == 0) continue; if (compare_blk64(wantedblk, buf[blk])) { if (founddatablk(buf[blk])) return 1; } } return 0; } static int find_indirblks64(uint64_t blk, int ind_level, uint64_t *wantedblk) { #define MAXNINDIR (MAXBSIZE / sizeof(uint64_t)) uint64_t idblk[MAXNINDIR]; int i; blread(fsreadfd, (char *)idblk, fsbtodb(&sblock, blk), (int)sblock.fs_bsize); if (ind_level <= 0) { if (find_blks64(idblk, sblock.fs_bsize / sizeof(uint64_t), wantedblk)) return 1; } else { ind_level--; for (i = 0; i < sblock.fs_bsize / sizeof(uint64_t); i++) { if (compare_blk64(wantedblk, idblk[i])) { if (founddatablk(idblk[i])) return 1; } if (idblk[i] != 0) if (find_indirblks64(idblk[i], ind_level, wantedblk)) return 1; } } #undef MAXNINDIR return 0; } int findino(struct inodesc *idesc); /* from fsck */ static int dolookup(char *name); static int dolookup(char *name) { struct inodesc idesc; if (!checkactivedir()) return 0; idesc.id_number = curinum; idesc.id_func = findino; idesc.id_name = name; idesc.id_type = DATA; idesc.id_fix = IGNORE; if (ckinode(curinode, &idesc) & FOUND) { curinum = idesc.id_parent; curinode = ginode(curinum); printactive(0); return 1; } else { warnx("name `%s' not found in current inode directory", name); return 0; } } CMDFUNCSTART(focusname) { char *p, *val; if (!checkactive()) return 1; ocurrent = curinum; if (argv[1][0] == '/') { - curinum = ROOTINO; - curinode = ginode(ROOTINO); + curinum = UFS_ROOTINO; + curinode = ginode(UFS_ROOTINO); } else { if (!checkactivedir()) return 1; } for (p = argv[1]; p != NULL;) { while ((val = strsep(&p, "/")) != NULL && *val == '\0'); if (val) { printf("component `%s': ", val); fflush(stdout); if (!dolookup(val)) { curinode = ginode(curinum); return(1); } } } return 0; } 
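/*
 * Illustrative aside, not part of the fsdb sources: a minimal,
 * self-contained sketch of the block-pointer geometry that findblk()
 * and find_indirblks32/64() above walk.  It assumes the
 * UFS_NDADDR/UFS_NIADDR layout renamed by this change and UFS1-sized
 * (32-bit) block pointers; everything else here is hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define UFS_NDADDR	12	/* direct block pointers per inode */
#define UFS_NIADDR	3	/* single, double, triple indirect */

static void
print_indir_spans(uint64_t bsize)
{
	uint64_t nindir, lbn, span;
	int level;

	nindir = bsize / sizeof(uint32_t);	/* pointers per indirect */
	lbn = UFS_NDADDR;		/* first indirect-mapped lbn */
	span = nindir;
	for (level = 0; level < UFS_NIADDR; level++) {
		printf("level %d maps lbns %ju..%ju\n", level,
		    (uintmax_t)lbn, (uintmax_t)(lbn + span - 1));
		lbn += span;
		span *= nindir;	/* each level multiplies the coverage */
	}
}

int
main(void)
{
	print_indir_spans(16384);
	return (0);
}

/*
 * With 16K blocks (nindir == 4096) this prints 12..4107, then
 * 4108..16781323, and so on -- the same ranges the recursive
 * find_indirblks32() search descends through one level at a time.
 */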
CMDFUNCSTART(ln) { ino_t inum; int rval; char *cp; GETINUM(1,inum); if (!checkactivedir()) return 1; rval = makeentry(curinum, inum, argv[2]); if (rval) printf("Ino %ju entered as `%s'\n", (uintmax_t)inum, argv[2]); else printf("could not enter name? weird.\n"); curinode = ginode(curinum); return rval; } CMDFUNCSTART(rm) { int rval; if (!checkactivedir()) return 1; rval = changeino(curinum, argv[1], 0); if (rval & ALTERED) { printf("Name `%s' removed\n", argv[1]); return 0; } else { printf("could not remove name ('%s')? weird.\n", argv[1]); return 1; } } long slotcount, desired; int chinumfunc(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (slotcount++ == desired) { dirp->d_ino = idesc->id_parent; return STOP|ALTERED|FOUND; } return KEEPON; } CMDFUNCSTART(chinum) { char *cp; ino_t inum; struct inodesc idesc; slotcount = 0; if (!checkactivedir()) return 1; GETINUM(2,inum); desired = strtol(argv[1], &cp, 0); if (cp == argv[1] || *cp != '\0' || desired < 0) { printf("invalid slot number `%s'\n", argv[1]); return 1; } idesc.id_number = curinum; idesc.id_func = chinumfunc; idesc.id_fix = IGNORE; idesc.id_type = DATA; idesc.id_parent = inum; /* XXX convenient hiding place */ if (ckinode(curinode, &idesc) & FOUND) return 0; else { warnx("no %sth slot in current directory", argv[1]); return 1; } } int chnamefunc(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; struct direct testdir; if (slotcount++ == desired) { /* will name fit? */ testdir.d_namlen = strlen(idesc->id_name); if (DIRSIZ(NEWDIRFMT, &testdir) <= dirp->d_reclen) { dirp->d_namlen = testdir.d_namlen; strcpy(dirp->d_name, idesc->id_name); return STOP|ALTERED|FOUND; } else return STOP|FOUND; /* won't fit, so give up */ } return KEEPON; } CMDFUNCSTART(chname) { int rval; char *cp; struct inodesc idesc; slotcount = 0; if (!checkactivedir()) return 1; desired = strtoul(argv[1], &cp, 0); if (cp == argv[1] || *cp != '\0') { printf("invalid slot number `%s'\n", argv[1]); return 1; } idesc.id_number = curinum; idesc.id_func = chnamefunc; idesc.id_fix = IGNORE; idesc.id_type = DATA; idesc.id_name = argv[2]; rval = ckinode(curinode, &idesc); if ((rval & (FOUND|ALTERED)) == (FOUND|ALTERED)) return 0; else if (rval & FOUND) { warnx("new name `%s' does not fit in slot %s\n", argv[2], argv[1]); return 1; } else { warnx("no %sth slot in current directory", argv[1]); return 1; } } struct typemap { const char *typename; int typebits; } typenamemap[] = { {"file", IFREG}, {"dir", IFDIR}, {"socket", IFSOCK}, {"fifo", IFIFO}, }; CMDFUNCSTART(newtype) { int type; struct typemap *tp; if (!checkactive()) return 1; type = DIP(curinode, di_mode) & IFMT; for (tp = typenamemap; tp < &typenamemap[nitems(typenamemap)]; tp++) { if (!strcmp(argv[1], tp->typename)) { printf("setting type to %s\n", tp->typename); type = tp->typebits; break; } } if (tp == &typenamemap[nitems(typenamemap)]) { warnx("type `%s' not known", argv[1]); warnx("try one of `file', `dir', `socket', `fifo'"); return 1; } DIP_SET(curinode, di_mode, DIP(curinode, di_mode) & ~IFMT); DIP_SET(curinode, di_mode, DIP(curinode, di_mode) | type); inodirty(); printactive(0); return 0; } CMDFUNCSTART(chlen) { int rval = 1; long len; char *cp; if (!checkactive()) return 1; len = strtol(argv[1], &cp, 0); if (cp == argv[1] || *cp != '\0' || len < 0) { warnx("bad length `%s'", argv[1]); return 1; } DIP_SET(curinode, di_size, len); inodirty(); printactive(0); return rval; } CMDFUNCSTART(chmode) { int rval = 1; long modebits; char *cp; if (!checkactive()) return 1; modebits = 
strtol(argv[1], &cp, 8); if (cp == argv[1] || *cp != '\0' || (modebits & ~07777)) { warnx("bad modebits `%s'", argv[1]); return 1; } DIP_SET(curinode, di_mode, DIP(curinode, di_mode) & ~07777); DIP_SET(curinode, di_mode, DIP(curinode, di_mode) | modebits); inodirty(); printactive(0); return rval; } CMDFUNCSTART(chaflags) { int rval = 1; u_long flags; char *cp; if (!checkactive()) return 1; flags = strtoul(argv[1], &cp, 0); if (cp == argv[1] || *cp != '\0' ) { warnx("bad flags `%s'", argv[1]); return 1; } if (flags > UINT_MAX) { warnx("flags set beyond 32-bit range of field (%lx)\n", flags); return(1); } DIP_SET(curinode, di_flags, flags); inodirty(); printactive(0); return rval; } CMDFUNCSTART(chgen) { int rval = 1; long gen; char *cp; if (!checkactive()) return 1; gen = strtol(argv[1], &cp, 0); if (cp == argv[1] || *cp != '\0' ) { warnx("bad gen `%s'", argv[1]); return 1; } if (gen > INT_MAX || gen < INT_MIN) { warnx("gen set beyond 32-bit range of field (%lx)\n", gen); return(1); } DIP_SET(curinode, di_gen, gen); inodirty(); printactive(0); return rval; } CMDFUNCSTART(linkcount) { int rval = 1; int lcnt; char *cp; if (!checkactive()) return 1; lcnt = strtol(argv[1], &cp, 0); if (cp == argv[1] || *cp != '\0' ) { warnx("bad link count `%s'", argv[1]); return 1; } if (lcnt > USHRT_MAX || lcnt < 0) { warnx("max link count is %d\n", USHRT_MAX); return 1; } DIP_SET(curinode, di_nlink, lcnt); inodirty(); printactive(0); return rval; } CMDFUNCSTART(chowner) { int rval = 1; unsigned long uid; char *cp; struct passwd *pwd; if (!checkactive()) return 1; uid = strtoul(argv[1], &cp, 0); if (cp == argv[1] || *cp != '\0' ) { /* try looking up name */ if ((pwd = getpwnam(argv[1]))) { uid = pwd->pw_uid; } else { warnx("bad uid `%s'", argv[1]); return 1; } } DIP_SET(curinode, di_uid, uid); inodirty(); printactive(0); return rval; } CMDFUNCSTART(chgroup) { int rval = 1; unsigned long gid; char *cp; struct group *grp; if (!checkactive()) return 1; gid = strtoul(argv[1], &cp, 0); if (cp == argv[1] || *cp != '\0' ) { if ((grp = getgrnam(argv[1]))) { gid = grp->gr_gid; } else { warnx("bad gid `%s'", argv[1]); return 1; } } DIP_SET(curinode, di_gid, gid); inodirty(); printactive(0); return rval; } int dotime(char *name, time_t *secp, int32_t *nsecp) { char *p, *val; struct tm t; int32_t nsec; p = strchr(name, '.'); if (p) { *p = '\0'; nsec = strtoul(++p, &val, 0); if (val == p || *val != '\0' || nsec >= 1000000000 || nsec < 0) { warnx("invalid nanoseconds"); goto badformat; } } else nsec = 0; if (strlen(name) != 14) { badformat: warnx("date format: YYYYMMDDHHMMSS[.nsec]"); return 1; } *nsecp = nsec; for (p = name; *p; p++) if (*p < '0' || *p > '9') goto badformat; p = name; #define VAL() ((*p++) - '0') t.tm_year = VAL(); t.tm_year = VAL() + t.tm_year * 10; t.tm_year = VAL() + t.tm_year * 10; t.tm_year = VAL() + t.tm_year * 10 - 1900; t.tm_mon = VAL(); t.tm_mon = VAL() + t.tm_mon * 10 - 1; t.tm_mday = VAL(); t.tm_mday = VAL() + t.tm_mday * 10; t.tm_hour = VAL(); t.tm_hour = VAL() + t.tm_hour * 10; t.tm_min = VAL(); t.tm_min = VAL() + t.tm_min * 10; t.tm_sec = VAL(); t.tm_sec = VAL() + t.tm_sec * 10; t.tm_isdst = -1; *secp = mktime(&t); if (*secp == -1) { warnx("date/time out of range"); return 1; } return 0; } CMDFUNCSTART(chbtime) { time_t secs; int32_t nsecs; if (dotime(argv[1], &secs, &nsecs)) return 1; if (sblock.fs_magic == FS_UFS1_MAGIC) return 1; curinode->dp2.di_birthtime = _time_to_time64(secs); curinode->dp2.di_birthnsec = nsecs; inodirty(); printactive(0); return 0; } CMDFUNCSTART(chmtime) { time_t 
secs; int32_t nsecs; if (dotime(argv[1], &secs, &nsecs)) return 1; if (sblock.fs_magic == FS_UFS1_MAGIC) curinode->dp1.di_mtime = _time_to_time32(secs); else curinode->dp2.di_mtime = _time_to_time64(secs); DIP_SET(curinode, di_mtimensec, nsecs); inodirty(); printactive(0); return 0; } CMDFUNCSTART(chatime) { time_t secs; int32_t nsecs; if (dotime(argv[1], &secs, &nsecs)) return 1; if (sblock.fs_magic == FS_UFS1_MAGIC) curinode->dp1.di_atime = _time_to_time32(secs); else curinode->dp2.di_atime = _time_to_time64(secs); DIP_SET(curinode, di_atimensec, nsecs); inodirty(); printactive(0); return 0; } CMDFUNCSTART(chctime) { time_t secs; int32_t nsecs; if (dotime(argv[1], &secs, &nsecs)) return 1; if (sblock.fs_magic == FS_UFS1_MAGIC) curinode->dp1.di_ctime = _time_to_time32(secs); else curinode->dp2.di_ctime = _time_to_time64(secs); DIP_SET(curinode, di_ctimensec, nsecs); inodirty(); printactive(0); return 0; } Index: head/sbin/fsdb/fsdbutil.c =================================================================== --- head/sbin/fsdb/fsdbutil.c (revision 313779) +++ head/sbin/fsdb/fsdbutil.c (revision 313780) @@ -1,374 +1,374 @@ /* $NetBSD: fsdbutil.c,v 1.2 1995/10/08 23:18:12 thorpej Exp $ */ /* * Copyright (c) 1995 John T. Kohl * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef lint static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include "fsdb.h" #include "fsck.h" static int charsperline(void); static void printindir(ufs2_daddr_t blk, int level, char *bufp); static void printblocks(ino_t inum, union dinode *dp); char ** crack(char *line, int *argc) { static char *argv[8]; int i; char *p, *val; for (p = line, i = 0; p != NULL && i < 8; i++) { while ((val = strsep(&p, " \t\n")) != NULL && *val == '\0') /**/; if (val) argv[i] = val; else break; } *argc = i; return argv; } char ** recrack(char *line, int *argc, int argc_max) { static char *argv[8]; int i; char *p, *val; for (p = line, i = 0; p != NULL && i < 8 && i < argc_max - 1; i++) { while ((val = strsep(&p, " \t\n")) != NULL && *val == '\0') /**/; if (val) argv[i] = val; else break; } argv[i] = argv[i - 1] + strlen(argv[i - 1]) + 1; argv[i][strcspn(argv[i], "\n")] = '\0'; *argc = i + 1; return argv; } int argcount(struct cmdtable *cmdp, int argc, char *argv[]) { if (cmdp->minargc == cmdp->maxargc) warnx("command `%s' takes %u arguments, got %u", cmdp->cmd, cmdp->minargc-1, argc-1); else warnx("command `%s' takes from %u to %u arguments", cmdp->cmd, cmdp->minargc-1, cmdp->maxargc-1); warnx("usage: %s: %s", cmdp->cmd, cmdp->helptxt); return 1; } void printstat(const char *cp, ino_t inum, union dinode *dp) { struct group *grp; struct passwd *pw; ufs2_daddr_t blocks; int64_t gen; char *p; time_t t; printf("%s: ", cp); switch (DIP(dp, di_mode) & IFMT) { case IFDIR: puts("directory"); break; case IFREG: puts("regular file"); break; case IFBLK: printf("block special (%#jx)", (uintmax_t)DIP(dp, di_rdev)); break; case IFCHR: printf("character special (%#jx)", DIP(dp, di_rdev)); break; case IFLNK: fputs("symlink",stdout); if (DIP(dp, di_size) > 0 && DIP(dp, di_size) < sblock.fs_maxsymlinklen && DIP(dp, di_blocks) == 0) { if (sblock.fs_magic == FS_UFS1_MAGIC) p = (caddr_t)dp->dp1.di_db; else p = (caddr_t)dp->dp2.di_db; printf(" to `%.*s'\n", (int) DIP(dp, di_size), p); } else { putchar('\n'); } break; case IFSOCK: puts("socket"); break; case IFIFO: puts("fifo"); break; } printf("I=%ju MODE=%o SIZE=%ju", (uintmax_t)inum, DIP(dp, di_mode), (uintmax_t)DIP(dp, di_size)); if (sblock.fs_magic != FS_UFS1_MAGIC) { t = _time64_to_time(dp->dp2.di_birthtime); p = ctime(&t); printf("\n\tBTIME=%15.15s %4.4s [%d nsec]", &p[4], &p[20], dp->dp2.di_birthnsec); } if (sblock.fs_magic == FS_UFS1_MAGIC) t = _time32_to_time(dp->dp1.di_mtime); else t = _time64_to_time(dp->dp2.di_mtime); p = ctime(&t); printf("\n\tMTIME=%15.15s %4.4s [%d nsec]", &p[4], &p[20], DIP(dp, di_mtimensec)); if (sblock.fs_magic == FS_UFS1_MAGIC) t = _time32_to_time(dp->dp1.di_ctime); else t = _time64_to_time(dp->dp2.di_ctime); p = ctime(&t); printf("\n\tCTIME=%15.15s %4.4s [%d nsec]", &p[4], &p[20], DIP(dp, di_ctimensec)); if (sblock.fs_magic == FS_UFS1_MAGIC) t = _time32_to_time(dp->dp1.di_atime); else t = _time64_to_time(dp->dp2.di_atime); p = ctime(&t); printf("\n\tATIME=%15.15s %4.4s [%d nsec]\n", &p[4], &p[20], DIP(dp, di_atimensec)); if ((pw = getpwuid(DIP(dp, di_uid)))) printf("OWNER=%s ", pw->pw_name); else printf("OWNUID=%u ", DIP(dp, di_uid)); if ((grp = getgrgid(DIP(dp, di_gid)))) printf("GRP=%s ", grp->gr_name); else printf("GID=%u ", DIP(dp, di_gid)); blocks = DIP(dp, di_blocks); gen = DIP(dp, di_gen); printf("LINKCNT=%d FLAGS=%#x BLKCNT=%jx GEN=%jx\n", DIP(dp, di_nlink), DIP(dp, di_flags), (intmax_t)blocks, (intmax_t)gen); } 
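/*
 * A minimal sketch (hypothetical, not part of fsdb) of the dispatch the
 * DIP() accessor used above performs: UFS1 and UFS2 keep the same logical
 * inode fields in different on-disk structures, so every field access has
 * to branch on the superblock magic.
 */
#if 0	/* illustration only */
static u_int32_t
example_dinode_uid(struct fs *fs, union dinode *dp)
{

	/* UFS1 inodes live in dp1, UFS2 inodes in dp2. */
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (dp->dp1.di_uid);
	return (dp->dp2.di_uid);
}
#endif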
/* * Determine the number of characters in a * single line. */ static int charsperline(void) { int columns; char *cp; struct winsize ws; columns = 0; if (ioctl(0, TIOCGWINSZ, &ws) != -1) columns = ws.ws_col; if (columns == 0 && (cp = getenv("COLUMNS"))) columns = atoi(cp); if (columns == 0) columns = 80; /* last resort */ return (columns); } /* * Recursively print a list of indirect blocks. */ static void printindir(ufs2_daddr_t blk, int level, char *bufp) { struct bufarea buf, *bp; char tempbuf[32]; /* enough to print an ufs2_daddr_t */ int i, j, cpl, charssofar; ufs2_daddr_t blkno; if (blk == 0) return; printf("%jd (%d) =>\n", (intmax_t)blk, level); if (level == 0) { /* for the final indirect level, don't use the cache */ bp = &buf; bp->b_un.b_buf = bufp; initbarea(bp, BT_UNKNOWN); getblk(bp, blk, sblock.fs_bsize); } else bp = getdatablk(blk, sblock.fs_bsize, BT_UNKNOWN); cpl = charsperline(); for (i = charssofar = 0; i < NINDIR(&sblock); i++) { if (sblock.fs_magic == FS_UFS1_MAGIC) blkno = bp->b_un.b_indir1[i]; else blkno = bp->b_un.b_indir2[i]; if (blkno == 0) continue; j = sprintf(tempbuf, "%jd", (intmax_t)blkno); if (level == 0) { charssofar += j; if (charssofar >= cpl - 2) { putchar('\n'); charssofar = j; } } fputs(tempbuf, stdout); if (level == 0) { printf(", "); charssofar += 2; } else { printf(" =>\n"); printindir(blkno, level - 1, bufp); printf("\n"); charssofar = 0; } } if (level == 0) putchar('\n'); return; } /* * Print the block pointers for one inode. */ static void printblocks(ino_t inum, union dinode *dp) { char *bufp; int i, nfrags; long ndb, offset; ufs2_daddr_t blkno; printf("Blocks for inode %ju:\n", (uintmax_t)inum); printf("Direct blocks:\n"); ndb = howmany(DIP(dp, di_size), sblock.fs_bsize); - for (i = 0; i < NDADDR && i < ndb; i++) { + for (i = 0; i < UFS_NDADDR && i < ndb; i++) { if (i > 0) printf(", "); blkno = DIP(dp, di_db[i]); printf("%jd", (intmax_t)blkno); } - if (ndb <= NDADDR) { + if (ndb <= UFS_NDADDR) { offset = blkoff(&sblock, DIP(dp, di_size)); if (offset != 0) { nfrags = numfrags(&sblock, fragroundup(&sblock, offset)); printf(" (%d frag%s)", nfrags, nfrags > 1? 
"s": ""); } } putchar('\n'); - if (ndb <= NDADDR) + if (ndb <= UFS_NDADDR) return; bufp = malloc((unsigned int)sblock.fs_bsize); if (bufp == NULL) errx(EEXIT, "cannot allocate indirect block buffer"); printf("Indirect blocks:\n"); - for (i = 0; i < NIADDR; i++) + for (i = 0; i < UFS_NIADDR; i++) printindir(DIP(dp, di_ib[i]), i, bufp); free(bufp); } int checkactive(void) { if (!curinode) { warnx("no current inode\n"); return 0; } return 1; } int checkactivedir(void) { if (!curinode) { warnx("no current inode\n"); return 0; } if ((DIP(curinode, di_mode) & IFMT) != IFDIR) { warnx("inode %ju not a directory", (uintmax_t)curinum); return 0; } return 1; } int printactive(int doblocks) { if (!checkactive()) return 1; switch (DIP(curinode, di_mode) & IFMT) { case IFDIR: case IFREG: case IFBLK: case IFCHR: case IFLNK: case IFSOCK: case IFIFO: if (doblocks) printblocks(curinum, curinode); else printstat("current inode", curinum, curinode); break; case 0: printf("current inode %ju: unallocated inode\n", (uintmax_t)curinum); break; default: printf("current inode %ju: screwy itype 0%o (mode 0%o)?\n", (uintmax_t)curinum, DIP(curinode, di_mode) & IFMT, DIP(curinode, di_mode)); break; } return 0; } Index: head/sbin/fsirand/fsirand.c =================================================================== --- head/sbin/fsirand/fsirand.c (revision 313779) +++ head/sbin/fsirand/fsirand.c (revision 313780) @@ -1,303 +1,303 @@ /* $OpenBSD: fsirand.c,v 1.9 1997/02/28 00:46:33 millert Exp $ */ /* * Copyright (c) 1997 Todd C. Miller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Todd C. Miller. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef lint static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include static void usage(void) __dead2; int fsirand(char *); /* * Possible superblock locations ordered from most to least likely. 
*/ static int sblock_try[] = SBLOCKSEARCH; static int printonly = 0, force = 0, ignorelabel = 0; int main(int argc, char *argv[]) { int n, ex = 0; struct rlimit rl; while ((n = getopt(argc, argv, "bfp")) != -1) { switch (n) { case 'b': ignorelabel++; break; case 'p': printonly++; break; case 'f': force++; break; default: usage(); } } if (argc - optind < 1) usage(); srandomdev(); /* Increase our data size to the max */ if (getrlimit(RLIMIT_DATA, &rl) == 0) { rl.rlim_cur = rl.rlim_max; if (setrlimit(RLIMIT_DATA, &rl) < 0) warn("can't get resource limit to max data size"); } else warn("can't get resource limit for data size"); for (n = optind; n < argc; n++) { if (argc - optind != 1) (void)puts(argv[n]); ex += fsirand(argv[n]); if (n < argc - 1) putchar('\n'); } exit(ex); } int fsirand(char *device) { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; caddr_t inodebuf; ssize_t ibufsize; struct fs *sblock; ino_t inumber; ufs2_daddr_t sblockloc, dblk; char sbuf[SBLOCKSIZE], sbuftmp[SBLOCKSIZE]; int i, devfd, n, cg; u_int32_t bsize = DEV_BSIZE; if ((devfd = open(device, printonly ? O_RDONLY : O_RDWR)) < 0) { warn("can't open %s", device); return (1); } dp1 = NULL; dp2 = NULL; /* Read in master superblock */ (void)memset(&sbuf, 0, sizeof(sbuf)); sblock = (struct fs *)&sbuf; for (i = 0; sblock_try[i] != -1; i++) { sblockloc = sblock_try[i]; if (lseek(devfd, sblockloc, SEEK_SET) == -1) { warn("can't seek to superblock (%jd) on %s", (intmax_t)sblockloc, device); return (1); } if ((n = read(devfd, (void *)sblock, SBLOCKSIZE))!=SBLOCKSIZE) { warnx("can't read superblock on %s: %s", device, (n < SBLOCKSIZE) ? "short read" : strerror(errno)); return (1); } if ((sblock->fs_magic == FS_UFS1_MAGIC || (sblock->fs_magic == FS_UFS2_MAGIC && sblock->fs_sblockloc == sblock_try[i])) && sblock->fs_bsize <= MAXBSIZE && sblock->fs_bsize >= (ssize_t)sizeof(struct fs)) break; } if (sblock_try[i] == -1) { fprintf(stderr, "Cannot find file system superblock\n"); return (1); } if (sblock->fs_magic == FS_UFS1_MAGIC && sblock->fs_old_inodefmt < FS_44INODEFMT) { warnx("file system format is too old, sorry"); return (1); } if (!force && !printonly && sblock->fs_clean != 1) { warnx("file system is not clean, fsck %s first", device); return (1); } /* Make sure backup superblocks are sane. */ sblock = (struct fs *)&sbuftmp; for (cg = 0; cg < (int)sblock->fs_ncg; cg++) { dblk = fsbtodb(sblock, cgsblock(sblock, cg)); if (lseek(devfd, (off_t)dblk * bsize, SEEK_SET) < 0) { warn("can't seek to %jd", (intmax_t)dblk * bsize); return (1); } else if ((n = read(devfd, (void *)sblock, SBLOCKSIZE)) != SBLOCKSIZE) { warn("can't read backup superblock %d on %s: %s", cg + 1, device, (n < SBLOCKSIZE) ?
"short write" : strerror(errno)); return (1); } if (sblock->fs_magic != FS_UFS1_MAGIC && sblock->fs_magic != FS_UFS2_MAGIC) { warnx("bad magic number in backup superblock %d on %s", cg + 1, device); return (1); } if (sblock->fs_sbsize > SBLOCKSIZE) { warnx("size of backup superblock %d on %s is preposterous", cg + 1, device); return (1); } } sblock = (struct fs *)&sbuf; /* XXX - should really cap buffer at 512kb or so */ if (sblock->fs_magic == FS_UFS1_MAGIC) ibufsize = sizeof(struct ufs1_dinode) * sblock->fs_ipg; else ibufsize = sizeof(struct ufs2_dinode) * sblock->fs_ipg; if ((inodebuf = malloc(ibufsize)) == NULL) errx(1, "can't allocate memory for inode buffer"); if (printonly && (sblock->fs_id[0] || sblock->fs_id[1])) { if (sblock->fs_id[0]) (void)printf("%s was randomized on %s", device, ctime((void *)&(sblock->fs_id[0]))); (void)printf("fsid: %x %x\n", sblock->fs_id[0], sblock->fs_id[1]); } /* Randomize fs_id unless old 4.2BSD file system */ if (!printonly) { /* Randomize fs_id and write out new sblock and backups */ sblock->fs_id[0] = (u_int32_t)time(NULL); sblock->fs_id[1] = random(); if (lseek(devfd, sblockloc, SEEK_SET) == -1) { warn("can't seek to superblock (%jd) on %s", (intmax_t)sblockloc, device); return (1); } if ((n = write(devfd, (void *)sblock, SBLOCKSIZE)) != SBLOCKSIZE) { warn("can't write superblock on %s: %s", device, (n < SBLOCKSIZE) ? "short write" : strerror(errno)); return (1); } } /* For each cylinder group, randomize inodes and update backup sblock */ for (cg = 0, inumber = 0; cg < (int)sblock->fs_ncg; cg++) { /* Update superblock if appropriate */ if (!printonly) { dblk = fsbtodb(sblock, cgsblock(sblock, cg)); if (lseek(devfd, (off_t)dblk * bsize, SEEK_SET) < 0) { warn("can't seek to %jd", (intmax_t)dblk * bsize); return (1); } else if ((n = write(devfd, (void *)sblock, SBLOCKSIZE)) != SBLOCKSIZE) { warn("can't write backup superblock %d on %s: %s", cg + 1, device, (n < SBLOCKSIZE) ? "short write" : strerror(errno)); return (1); } } /* Read in inodes, then print or randomize generation nums */ dblk = fsbtodb(sblock, ino_to_fsba(sblock, inumber)); if (lseek(devfd, (off_t)dblk * bsize, SEEK_SET) < 0) { warn("can't seek to %jd", (intmax_t)dblk * bsize); return (1); } else if ((n = read(devfd, inodebuf, ibufsize)) != ibufsize) { warnx("can't read inodes: %s", (n < ibufsize) ? "short read" : strerror(errno)); return (1); } for (n = 0; n < (int)sblock->fs_ipg; n++, inumber++) { if (sblock->fs_magic == FS_UFS1_MAGIC) dp1 = &((struct ufs1_dinode *)inodebuf)[n]; else dp2 = &((struct ufs2_dinode *)inodebuf)[n]; - if (inumber >= ROOTINO) { + if (inumber >= UFS_ROOTINO) { if (printonly) (void)printf("ino %ju gen %08x\n", (uintmax_t)inumber, sblock->fs_magic == FS_UFS1_MAGIC ? dp1->di_gen : dp2->di_gen); else if (sblock->fs_magic == FS_UFS1_MAGIC) dp1->di_gen = random(); else dp2->di_gen = random(); } } /* Write out modified inodes */ if (!printonly) { if (lseek(devfd, (off_t)dblk * bsize, SEEK_SET) < 0) { warn("can't seek to %jd", (intmax_t)dblk * bsize); return (1); } else if ((n = write(devfd, inodebuf, ibufsize)) != ibufsize) { warnx("can't write inodes: %s", (n != ibufsize) ? 
"short write" : strerror(errno)); return (1); } } } (void)close(devfd); return(0); } static void usage(void) { (void)fprintf(stderr, "usage: fsirand [-b] [-f] [-p] special [special ...]\n"); exit(1); } Index: head/sbin/growfs/debug.c =================================================================== --- head/sbin/growfs/debug.c (revision 313779) +++ head/sbin/growfs/debug.c (revision 313780) @@ -1,842 +1,842 @@ /* * Copyright (c) 2000 Christoph Herrmann, Thomas-Henning von Kamptz * Copyright (c) 1980, 1989, 1993 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * Christoph Herrmann and Thomas-Henning von Kamptz, Munich and Frankfurt. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgment: * This product includes software developed by the University of * California, Berkeley and its contributors, as well as Christoph * Herrmann and Thomas-Henning von Kamptz. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $TSHeader: src/sbin/growfs/debug.c,v 1.3 2000/12/12 19:31:00 tomsoft Exp $ * */ #ifndef lint static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include "debug.h" #ifdef FS_DEBUG static FILE *dbg_log = NULL; static unsigned int indent = 0; /* * prototypes not done here, as they come with debug.h */ /* * Open the filehandle where all debug output has to go. */ void dbg_open(const char *fn) { if (strcmp(fn, "-") == 0) dbg_log = fopen("/dev/stdout", "a"); else dbg_log = fopen(fn, "a"); return; } /* * Close the filehandle where all debug output went to. */ void dbg_close(void) { if (dbg_log) { fclose(dbg_log); dbg_log = NULL; } return; } /* * Dump out a full file system block in hex. 
*/ void dbg_dump_hex(struct fs *sb, const char *comment, unsigned char *mem) { int i, j, k; if (!dbg_log) return; fprintf(dbg_log, "===== START HEXDUMP =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)mem, comment); indent++; for (i = 0; i < sb->fs_bsize; i += 24) { for (j = 0; j < 3; j++) { for (k = 0; k < 8; k++) fprintf(dbg_log, "%02x ", *mem++); fprintf(dbg_log, " "); } fprintf(dbg_log, "\n"); } indent--; fprintf(dbg_log, "===== END HEXDUMP =====\n"); return; } /* * Dump the superblock. */ void dbg_dump_fs(struct fs *sb, const char *comment) { int j; if (!dbg_log) return; fprintf(dbg_log, "===== START SUPERBLOCK =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)sb, comment); indent++; fprintf(dbg_log, "sblkno int32_t 0x%08x\n", sb->fs_sblkno); fprintf(dbg_log, "cblkno int32_t 0x%08x\n", sb->fs_cblkno); fprintf(dbg_log, "iblkno int32_t 0x%08x\n", sb->fs_iblkno); fprintf(dbg_log, "dblkno int32_t 0x%08x\n", sb->fs_dblkno); fprintf(dbg_log, "old_cgoffset int32_t 0x%08x\n", sb->fs_old_cgoffset); fprintf(dbg_log, "old_cgmask int32_t 0x%08x\n", sb->fs_old_cgmask); fprintf(dbg_log, "old_time int32_t %10u\n", (unsigned int)sb->fs_old_time); fprintf(dbg_log, "old_size int32_t 0x%08x\n", sb->fs_old_size); fprintf(dbg_log, "old_dsize int32_t 0x%08x\n", sb->fs_old_dsize); fprintf(dbg_log, "ncg int32_t 0x%08x\n", sb->fs_ncg); fprintf(dbg_log, "bsize int32_t 0x%08x\n", sb->fs_bsize); fprintf(dbg_log, "fsize int32_t 0x%08x\n", sb->fs_fsize); fprintf(dbg_log, "frag int32_t 0x%08x\n", sb->fs_frag); fprintf(dbg_log, "minfree int32_t 0x%08x\n", sb->fs_minfree); fprintf(dbg_log, "old_rotdelay int32_t 0x%08x\n", sb->fs_old_rotdelay); fprintf(dbg_log, "old_rps int32_t 0x%08x\n", sb->fs_old_rps); fprintf(dbg_log, "bmask int32_t 0x%08x\n", sb->fs_bmask); fprintf(dbg_log, "fmask int32_t 0x%08x\n", sb->fs_fmask); fprintf(dbg_log, "bshift int32_t 0x%08x\n", sb->fs_bshift); fprintf(dbg_log, "fshift int32_t 0x%08x\n", sb->fs_fshift); fprintf(dbg_log, "maxcontig int32_t 0x%08x\n", sb->fs_maxcontig); fprintf(dbg_log, "maxbpg int32_t 0x%08x\n", sb->fs_maxbpg); fprintf(dbg_log, "fragshift int32_t 0x%08x\n", sb->fs_fragshift); fprintf(dbg_log, "fsbtodb int32_t 0x%08x\n", sb->fs_fsbtodb); fprintf(dbg_log, "sbsize int32_t 0x%08x\n", sb->fs_sbsize); fprintf(dbg_log, "spare1 int32_t[2] 0x%08x 0x%08x\n", sb->fs_spare1[0], sb->fs_spare1[1]); fprintf(dbg_log, "nindir int32_t 0x%08x\n", sb->fs_nindir); fprintf(dbg_log, "inopb int32_t 0x%08x\n", sb->fs_inopb); fprintf(dbg_log, "old_nspf int32_t 0x%08x\n", sb->fs_old_nspf); fprintf(dbg_log, "optim int32_t 0x%08x\n", sb->fs_optim); fprintf(dbg_log, "old_npsect int32_t 0x%08x\n", sb->fs_old_npsect); fprintf(dbg_log, "old_interleave int32_t 0x%08x\n", sb->fs_old_interleave); fprintf(dbg_log, "old_trackskew int32_t 0x%08x\n", sb->fs_old_trackskew); fprintf(dbg_log, "id int32_t[2] 0x%08x 0x%08x\n", sb->fs_id[0], sb->fs_id[1]); fprintf(dbg_log, "old_csaddr int32_t 0x%08x\n", sb->fs_old_csaddr); fprintf(dbg_log, "cssize int32_t 0x%08x\n", sb->fs_cssize); fprintf(dbg_log, "cgsize int32_t 0x%08x\n", sb->fs_cgsize); fprintf(dbg_log, "spare2 int32_t 0x%08x\n", sb->fs_spare2); fprintf(dbg_log, "old_nsect int32_t 0x%08x\n", sb->fs_old_nsect); fprintf(dbg_log, "old_spc int32_t 0x%08x\n", sb->fs_old_spc); fprintf(dbg_log, "old_ncyl int32_t 0x%08x\n", sb->fs_old_ncyl); fprintf(dbg_log, "old_cpg int32_t 0x%08x\n", sb->fs_old_cpg); fprintf(dbg_log, "ipg int32_t 0x%08x\n", sb->fs_ipg); fprintf(dbg_log, "fpg int32_t 0x%08x\n", sb->fs_fpg); 
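/* fs_old_cstotal is the legacy 32-bit summary (struct csum); the 64-bit fs_cstotal is dumped further below via dbg_dump_csum_total(). */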
dbg_dump_csum("internal old_cstotal", &sb->fs_old_cstotal); fprintf(dbg_log, "fmod int8_t 0x%02x\n", sb->fs_fmod); fprintf(dbg_log, "clean int8_t 0x%02x\n", sb->fs_clean); fprintf(dbg_log, "ronly int8_t 0x%02x\n", sb->fs_ronly); fprintf(dbg_log, "old_flags int8_t 0x%02x\n", sb->fs_old_flags); fprintf(dbg_log, "fsmnt u_char[MAXMNTLEN] \"%s\"\n", sb->fs_fsmnt); fprintf(dbg_log, "volname u_char[MAXVOLLEN] \"%s\"\n", sb->fs_volname); fprintf(dbg_log, "swuid u_int64_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_swuid))[1], ((unsigned int *)&(sb->fs_swuid))[0]); fprintf(dbg_log, "pad int32_t 0x%08x\n", sb->fs_pad); fprintf(dbg_log, "cgrotor int32_t 0x%08x\n", sb->fs_cgrotor); /* * struct csum[MAXCSBUFS] - is only maintained in memory */ /* fprintf(dbg_log, " int32_t\n", sb->*fs_maxcluster);*/ fprintf(dbg_log, "old_cpc int32_t 0x%08x\n", sb->fs_old_cpc); /* * int16_t fs_opostbl[16][8] - is dumped when used in dbg_dump_sptbl */ fprintf(dbg_log, "maxbsize int32_t 0x%08x\n", sb->fs_maxbsize); fprintf(dbg_log, "unrefs int64_t 0x%08jx\n", sb->fs_unrefs); fprintf(dbg_log, "sblockloc int64_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_sblockloc))[1], ((unsigned int *)&(sb->fs_sblockloc))[0]); dbg_dump_csum_total("internal cstotal", &sb->fs_cstotal); fprintf(dbg_log, "time ufs_time_t %10u\n", (unsigned int)sb->fs_time); fprintf(dbg_log, "size int64_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_size))[1], ((unsigned int *)&(sb->fs_size))[0]); fprintf(dbg_log, "dsize int64_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_dsize))[1], ((unsigned int *)&(sb->fs_dsize))[0]); fprintf(dbg_log, "csaddr ufs2_daddr_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_csaddr))[1], ((unsigned int *)&(sb->fs_csaddr))[0]); fprintf(dbg_log, "pendingblocks int64_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_pendingblocks))[1], ((unsigned int *)&(sb->fs_pendingblocks))[0]); fprintf(dbg_log, "pendinginodes int32_t 0x%08x\n", sb->fs_pendinginodes); for (j = 0; j < FSMAXSNAP; j++) { fprintf(dbg_log, "snapinum int32_t[%2d] 0x%08x\n", j, sb->fs_snapinum[j]); if (!sb->fs_snapinum[j]) { /* list is dense */ break; } } fprintf(dbg_log, "avgfilesize int32_t 0x%08x\n", sb->fs_avgfilesize); fprintf(dbg_log, "avgfpdir int32_t 0x%08x\n", sb->fs_avgfpdir); fprintf(dbg_log, "save_cgsize int32_t 0x%08x\n", sb->fs_save_cgsize); fprintf(dbg_log, "flags int32_t 0x%08x\n", sb->fs_flags); fprintf(dbg_log, "contigsumsize int32_t 0x%08x\n", sb->fs_contigsumsize); fprintf(dbg_log, "maxsymlinklen int32_t 0x%08x\n", sb->fs_maxsymlinklen); fprintf(dbg_log, "old_inodefmt int32_t 0x%08x\n", sb->fs_old_inodefmt); fprintf(dbg_log, "maxfilesize u_int64_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_maxfilesize))[1], ((unsigned int *)&(sb->fs_maxfilesize))[0]); fprintf(dbg_log, "qbmask int64_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_qbmask))[1], ((unsigned int *)&(sb->fs_qbmask))[0]); fprintf(dbg_log, "qfmask int64_t 0x%08x%08x\n", ((unsigned int *)&(sb->fs_qfmask))[1], ((unsigned int *)&(sb->fs_qfmask))[0]); fprintf(dbg_log, "state int32_t 0x%08x\n", sb->fs_state); fprintf(dbg_log, "old_postblformat int32_t 0x%08x\n", sb->fs_old_postblformat); fprintf(dbg_log, "old_nrpos int32_t 0x%08x\n", sb->fs_old_nrpos); fprintf(dbg_log, "spare5 int32_t[2] 0x%08x 0x%08x\n", sb->fs_spare5[0], sb->fs_spare5[1]); fprintf(dbg_log, "magic int32_t 0x%08x\n", sb->fs_magic); indent--; fprintf(dbg_log, "===== END SUPERBLOCK =====\n"); return; } /* * Dump a cylinder group. 
*/ void dbg_dump_cg(const char *comment, struct cg *cgr) { int j; if (!dbg_log) return; fprintf(dbg_log, "===== START CYLINDER GROUP =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)cgr, comment); indent++; fprintf(dbg_log, "magic int32_t 0x%08x\n", cgr->cg_magic); fprintf(dbg_log, "old_time int32_t 0x%08x\n", cgr->cg_old_time); fprintf(dbg_log, "cgx int32_t 0x%08x\n", cgr->cg_cgx); fprintf(dbg_log, "old_ncyl int16_t 0x%04x\n", cgr->cg_old_ncyl); fprintf(dbg_log, "old_niblk int16_t 0x%04x\n", cgr->cg_old_niblk); fprintf(dbg_log, "ndblk int32_t 0x%08x\n", cgr->cg_ndblk); dbg_dump_csum("internal cs", &cgr->cg_cs); fprintf(dbg_log, "rotor int32_t 0x%08x\n", cgr->cg_rotor); fprintf(dbg_log, "frotor int32_t 0x%08x\n", cgr->cg_frotor); fprintf(dbg_log, "irotor int32_t 0x%08x\n", cgr->cg_irotor); for (j = 0; j < MAXFRAG; j++) { fprintf(dbg_log, "frsum int32_t[%d] 0x%08x\n", j, cgr->cg_frsum[j]); } fprintf(dbg_log, "old_btotoff int32_t 0x%08x\n", cgr->cg_old_btotoff); fprintf(dbg_log, "old_boff int32_t 0x%08x\n", cgr->cg_old_boff); fprintf(dbg_log, "iusedoff int32_t 0x%08x\n", cgr->cg_iusedoff); fprintf(dbg_log, "freeoff int32_t 0x%08x\n", cgr->cg_freeoff); fprintf(dbg_log, "nextfreeoff int32_t 0x%08x\n", cgr->cg_nextfreeoff); fprintf(dbg_log, "clustersumoff int32_t 0x%08x\n", cgr->cg_clustersumoff); fprintf(dbg_log, "clusteroff int32_t 0x%08x\n", cgr->cg_clusteroff); fprintf(dbg_log, "nclusterblks int32_t 0x%08x\n", cgr->cg_nclusterblks); fprintf(dbg_log, "niblk int32_t 0x%08x\n", cgr->cg_niblk); fprintf(dbg_log, "initediblk int32_t 0x%08x\n", cgr->cg_initediblk); fprintf(dbg_log, "unrefs int32_t 0x%08x\n", cgr->cg_unrefs); fprintf(dbg_log, "time ufs_time_t %10u\n", (unsigned int)cgr->cg_time); indent--; fprintf(dbg_log, "===== END CYLINDER GROUP =====\n"); return; } /* * Dump a cylinder summary. */ void dbg_dump_csum(const char *comment, struct csum *cs) { if (!dbg_log) return; fprintf(dbg_log, "===== START CYLINDER SUMMARY =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)cs, comment); indent++; fprintf(dbg_log, "ndir int32_t 0x%08x\n", cs->cs_ndir); fprintf(dbg_log, "nbfree int32_t 0x%08x\n", cs->cs_nbfree); fprintf(dbg_log, "nifree int32_t 0x%08x\n", cs->cs_nifree); fprintf(dbg_log, "nffree int32_t 0x%08x\n", cs->cs_nffree); indent--; fprintf(dbg_log, "===== END CYLINDER SUMMARY =====\n"); return; } /* * Dump the cylinder summary total. */ void dbg_dump_csum_total(const char *comment, struct csum_total *cs) { if (!dbg_log) return; fprintf(dbg_log, "===== START CYLINDER SUMMARY TOTAL =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)cs, comment); indent++; fprintf(dbg_log, "ndir int64_t 0x%08x%08x\n", ((unsigned int *)&(cs->cs_ndir))[1], ((unsigned int *)&(cs->cs_ndir))[0]); fprintf(dbg_log, "nbfree int64_t 0x%08x%08x\n", ((unsigned int *)&(cs->cs_nbfree))[1], ((unsigned int *)&(cs->cs_nbfree))[0]); fprintf(dbg_log, "nifree int64_t 0x%08x%08x\n", ((unsigned int *)&(cs->cs_nifree))[1], ((unsigned int *)&(cs->cs_nifree))[0]); fprintf(dbg_log, "nffree int64_t 0x%08x%08x\n", ((unsigned int *)&(cs->cs_nffree))[1], ((unsigned int *)&(cs->cs_nffree))[0]); fprintf(dbg_log, "numclusters int64_t 0x%08x%08x\n", ((unsigned int *)&(cs->cs_numclusters))[1], ((unsigned int *)&(cs->cs_numclusters))[0]); indent--; fprintf(dbg_log, "===== END CYLINDER SUMMARY TOTAL =====\n"); return; } /* * Dump the inode allocation map in one cylinder group. 
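* Each row covers 32 map bytes (256 inodes), printed in eight-byte groups.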
*/ void dbg_dump_inmap(struct fs *sb, const char *comment, struct cg *cgr) { int j,k,l,e; unsigned char *cp; if (!dbg_log) return; fprintf(dbg_log, "===== START INODE ALLOCATION MAP =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)cgr, comment); indent++; cp = (unsigned char *)cg_inosused(cgr); e = sb->fs_ipg / 8; for (j = 0; j < e; j += 32) { fprintf(dbg_log, "%08x: ", j); for (k = 0; k < 32; k += 8) { if (j + k + 8 < e) { fprintf(dbg_log, "%02x%02x%02x%02x%02x%02x%02x%02x ", cp[0], cp[1], cp[2], cp[3], cp[4], cp[5], cp[6], cp[7]); } else { for (l = 0; (l < 8) && (j + k + l < e); l++) { fprintf(dbg_log, "%02x", cp[l]); } } cp += 8; } fprintf(dbg_log, "\n"); } indent--; fprintf(dbg_log, "===== END INODE ALLOCATION MAP =====\n"); return; } /* * Dump the fragment allocation map in one cylinder group. */ void dbg_dump_frmap(struct fs *sb, const char *comment, struct cg *cgr) { int j,k,l,e; unsigned char *cp; if (!dbg_log) return; fprintf(dbg_log, "===== START FRAGMENT ALLOCATION MAP =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)cgr, comment); indent++; cp = (unsigned char *)cg_blksfree(cgr); if (sb->fs_old_nspf) e = howmany(sb->fs_old_cpg * sb->fs_old_spc / sb->fs_old_nspf, CHAR_BIT); else e = 0; for (j = 0; j < e; j += 32) { fprintf(dbg_log, "%08x: ", j); for (k = 0; k < 32; k += 8) { if (j + k + 8 < e) { fprintf(dbg_log, "%02x%02x%02x%02x%02x%02x%02x%02x ", cp[0], cp[1], cp[2], cp[3], cp[4], cp[5], cp[6], cp[7]); } else { for (l = 0; (l < 8) && (j + k + l < e); l++) { fprintf(dbg_log, "%02x", cp[l]); } } cp += 8; } fprintf(dbg_log, "\n"); } indent--; fprintf(dbg_log, "===== END FRAGMENT ALLOCATION MAP =====\n"); return; } /* * Dump the cluster allocation map in one cylinder group. */ void dbg_dump_clmap(struct fs *sb, const char *comment, struct cg *cgr) { int j,k,l,e; unsigned char *cp; if (!dbg_log) return; fprintf(dbg_log, "===== START CLUSTER ALLOCATION MAP =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)cgr, comment); indent++; cp = (unsigned char *)cg_clustersfree(cgr); if (sb->fs_old_nspf) e = howmany(sb->fs_old_cpg * sb->fs_old_spc / (sb->fs_old_nspf << sb->fs_fragshift), CHAR_BIT); else e = 0; for (j = 0; j < e; j += 32) { fprintf(dbg_log, "%08x: ", j); for (k = 0; k < 32; k += 8) { if (j + k + 8 < e) { fprintf(dbg_log, "%02x%02x%02x%02x%02x%02x%02x%02x ", cp[0], cp[1], cp[2], cp[3], cp[4], cp[5], cp[6], cp[7]); } else { for (l = 0; (l < 8) && (j + k + l < e); l++) { fprintf(dbg_log, "%02x", cp[l]); } } cp += 8; } fprintf(dbg_log, "\n"); } indent--; fprintf(dbg_log, "===== END CLUSTER ALLOCATION MAP =====\n"); return; } /* * Dump the cluster summary. */ void dbg_dump_clsum(struct fs *sb, const char *comment, struct cg *cgr) { int j; int *ip; if (!dbg_log) return; fprintf(dbg_log, "===== START CLUSTER SUMMARY =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)cgr, comment); indent++; ip = (int *)cg_clustersum(cgr); for (j = 0; j <= sb->fs_contigsumsize; j++) { fprintf(dbg_log, "%02d: %8d\n", j, *ip++); } indent--; fprintf(dbg_log, "===== END CLUSTER SUMMARY =====\n"); return; } #ifdef NOT_CURRENTLY /* * This code dates from before the UFS2 integration, and doesn't compile * post-UFS2 due to the use of cg_blks(). I'm not sure how best to update * this for UFS2, where the rotational bits of UFS no longer apply, so * will leave it disabled for now; it should probably be re-enabled * specifically for UFS1. */ /* * Dump the block summary, and the rotational layout table. */ void dbg_dump_sptbl(struct fs *sb, const char *comment, struct cg *cgr) { int j,k; int *ip; if (!dbg_log) return; fprintf(dbg_log, "===== START BLOCK SUMMARY AND POSITION TABLE =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)cgr, comment); indent++; ip = (int *)cg_blktot(cgr); for (j = 0; j < sb->fs_old_cpg; j++) { fprintf(dbg_log, "%2d: %5d = ", j, *ip++); for (k = 0; k < sb->fs_old_nrpos; k++) { fprintf(dbg_log, "%4d", cg_blks(sb, cgr, j)[k]); if (k < sb->fs_old_nrpos - 1) fprintf(dbg_log, " + "); } fprintf(dbg_log, "\n"); } indent--; fprintf(dbg_log, "===== END BLOCK SUMMARY AND POSITION TABLE =====\n"); return; } #endif /* * Dump a UFS1 inode structure. 
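* Only the block pointers implied by di_size are dumped: up to UFS_NDADDR
* direct entries, then one entry per indirect level still in use.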
*/ void dbg_dump_ufs1_ino(struct fs *sb, const char *comment, struct ufs1_dinode *ino) { int ictr; int remaining_blocks; if (!dbg_log) return; fprintf(dbg_log, "===== START UFS1 INODE DUMP =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)ino, comment); indent++; fprintf(dbg_log, "mode u_int16_t 0%o\n", ino->di_mode); fprintf(dbg_log, "nlink int16_t 0x%04x\n", ino->di_nlink); fprintf(dbg_log, "size u_int64_t 0x%08x%08x\n", ((unsigned int *)&(ino->di_size))[1], ((unsigned int *)&(ino->di_size))[0]); fprintf(dbg_log, "atime int32_t 0x%08x\n", ino->di_atime); fprintf(dbg_log, "atimensec int32_t 0x%08x\n", ino->di_atimensec); fprintf(dbg_log, "mtime int32_t 0x%08x\n", ino->di_mtime); fprintf(dbg_log, "mtimensec int32_t 0x%08x\n", ino->di_mtimensec); fprintf(dbg_log, "ctime int32_t 0x%08x\n", ino->di_ctime); fprintf(dbg_log, "ctimensec int32_t 0x%08x\n", ino->di_ctimensec); remaining_blocks = howmany(ino->di_size, sb->fs_bsize); /* XXX ts - +1? */ - for (ictr = 0; ictr < MIN(NDADDR, remaining_blocks); ictr++) { + for (ictr = 0; ictr < MIN(UFS_NDADDR, remaining_blocks); ictr++) { fprintf(dbg_log, "db ufs_daddr_t[%x] 0x%08x\n", ictr, ino->di_db[ictr]); } - remaining_blocks -= NDADDR; + remaining_blocks -= UFS_NDADDR; if (remaining_blocks > 0) { fprintf(dbg_log, "ib ufs_daddr_t[0] 0x%08x\n", ino->di_ib[0]); } remaining_blocks -= howmany(sb->fs_bsize, sizeof(ufs1_daddr_t)); if (remaining_blocks > 0) { fprintf(dbg_log, "ib ufs_daddr_t[1] 0x%08x\n", ino->di_ib[1]); } #define SQUARE(a) ((a) * (a)) remaining_blocks -= SQUARE(howmany(sb->fs_bsize, sizeof(ufs1_daddr_t))); #undef SQUARE if (remaining_blocks > 0) { fprintf(dbg_log, "ib ufs_daddr_t[2] 0x%08x\n", ino->di_ib[2]); } fprintf(dbg_log, "flags u_int32_t 0x%08x\n", ino->di_flags); fprintf(dbg_log, "blocks int32_t 0x%08x\n", ino->di_blocks); fprintf(dbg_log, "gen int32_t 0x%08x\n", ino->di_gen); fprintf(dbg_log, "uid u_int32_t 0x%08x\n", ino->di_uid); fprintf(dbg_log, "gid u_int32_t 0x%08x\n", ino->di_gid); indent--; fprintf(dbg_log, "===== END UFS1 INODE DUMP =====\n"); return; } /* * Dump a UFS2 inode structure. 
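* Same traversal as the UFS1 dump, but with 64-bit ufs2_daddr_t block
* pointers plus the UFS2-only birthtime and extended-attribute fields.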
*/ void dbg_dump_ufs2_ino(struct fs *sb, const char *comment, struct ufs2_dinode *ino) { int ictr; int remaining_blocks; if (!dbg_log) return; fprintf(dbg_log, "===== START UFS2 INODE DUMP =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)ino, comment); indent++; fprintf(dbg_log, "mode u_int16_t 0%o\n", ino->di_mode); fprintf(dbg_log, "nlink int16_t 0x%04x\n", ino->di_nlink); fprintf(dbg_log, "uid u_int32_t 0x%08x\n", ino->di_uid); fprintf(dbg_log, "gid u_int32_t 0x%08x\n", ino->di_gid); fprintf(dbg_log, "blksize u_int32_t 0x%08x\n", ino->di_blksize); fprintf(dbg_log, "size u_int64_t 0x%08x%08x\n", ((unsigned int *)&(ino->di_size))[1], ((unsigned int *)&(ino->di_size))[0]); fprintf(dbg_log, "blocks u_int64_t 0x%08x%08x\n", ((unsigned int *)&(ino->di_blocks))[1], ((unsigned int *)&(ino->di_blocks))[0]); fprintf(dbg_log, "atime ufs_time_t %10jd\n", ino->di_atime); fprintf(dbg_log, "mtime ufs_time_t %10jd\n", ino->di_mtime); fprintf(dbg_log, "ctime ufs_time_t %10jd\n", ino->di_ctime); fprintf(dbg_log, "birthtime ufs_time_t %10jd\n", ino->di_birthtime); fprintf(dbg_log, "mtimensec int32_t 0x%08x\n", ino->di_mtimensec); fprintf(dbg_log, "atimensec int32_t 0x%08x\n", ino->di_atimensec); fprintf(dbg_log, "ctimensec int32_t 0x%08x\n", ino->di_ctimensec); fprintf(dbg_log, "birthnsec int32_t 0x%08x\n", ino->di_birthnsec); fprintf(dbg_log, "gen int32_t 0x%08x\n", ino->di_gen); fprintf(dbg_log, "kernflags u_int32_t 0x%08x\n", ino->di_kernflags); fprintf(dbg_log, "flags u_int32_t 0x%08x\n", ino->di_flags); fprintf(dbg_log, "extsize u_int32_t 0x%08x\n", ino->di_extsize); - /* XXX: What do we do with di_extb[NXADDR]? */ + /* XXX: What do we do with di_extb[UFS_NXADDR]? */ remaining_blocks = howmany(ino->di_size, sb->fs_bsize); /* XXX ts - +1? */ - for (ictr = 0; ictr < MIN(NDADDR, remaining_blocks); ictr++) { + for (ictr = 0; ictr < MIN(UFS_NDADDR, remaining_blocks); ictr++) { fprintf(dbg_log, "db ufs2_daddr_t[%x] 0x%16jx\n", ictr, ino->di_db[ictr]); } - remaining_blocks -= NDADDR; + remaining_blocks -= UFS_NDADDR; if (remaining_blocks > 0) { fprintf(dbg_log, "ib ufs2_daddr_t[0] 0x%16jx\n", ino->di_ib[0]); } remaining_blocks -= howmany(sb->fs_bsize, sizeof(ufs2_daddr_t)); if (remaining_blocks > 0) { fprintf(dbg_log, "ib ufs2_daddr_t[1] 0x%16jx\n", ino->di_ib[1]); } #define SQUARE(a) ((a) * (a)) remaining_blocks -= SQUARE(howmany(sb->fs_bsize, sizeof(ufs2_daddr_t))); #undef SQUARE if (remaining_blocks > 0) { fprintf(dbg_log, "ib ufs2_daddr_t[2] 0x%16jx\n", ino->di_ib[2]); } indent--; fprintf(dbg_log, "===== END UFS2 INODE DUMP =====\n"); return; } /* * Dump an indirect block. The iteration to dump a full file has to be * written around. 
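* Note that "length" is counted in block-pointer entries of the native
* daddr size, not in bytes.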
*/ void dbg_dump_iblk(struct fs *sb, const char *comment, char *block, size_t length) { unsigned int *mem, i, j, size; if (!dbg_log) return; fprintf(dbg_log, "===== START INDIRECT BLOCK DUMP =====\n"); fprintf(dbg_log, "# %d@%lx: %s\n", indent, (unsigned long)block, comment); indent++; if (sb->fs_magic == FS_UFS1_MAGIC) size = sizeof(ufs1_daddr_t); else size = sizeof(ufs2_daddr_t); mem = (unsigned int *)block; for (i = 0; (size_t)i < MIN(howmany(sb->fs_bsize, size), length); i += 8) { fprintf(dbg_log, "%04x: ", i); for (j = 0; j < 8; j++) { if ((size_t)(i + j) < length) fprintf(dbg_log, "%08X ", *mem++); } fprintf(dbg_log, "\n"); } indent--; fprintf(dbg_log, "===== END INDIRECT BLOCK DUMP =====\n"); return; } #endif /* FS_DEBUG */ Index: head/sbin/growfs/growfs.c =================================================================== --- head/sbin/growfs/growfs.c (revision 313779) +++ head/sbin/growfs/growfs.c (revision 313780) @@ -1,1741 +1,1741 @@ /* * Copyright (c) 1980, 1989, 1993 The Regents of the University of California. * Copyright (c) 2000 Christoph Herrmann, Thomas-Henning von Kamptz * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * Christoph Herrmann and Thomas-Henning von Kamptz, Munich and Frankfurt. * * Portions of this software were developed by Edward Tomasz Napierala * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgment: * This product includes software developed by the University of * California, Berkeley and its contributors, as well as Christoph * Herrmann and Thomas-Henning von Kamptz. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $TSHeader: src/sbin/growfs/growfs.c,v 1.5 2000/12/12 19:31:00 tomsoft Exp $ * */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 2000 Christoph Herrmann, Thomas-Henning von Kamptz\n\ Copyright (c) 1980, 1989, 1993 The Regents of the University of California.\n\ All rights reserved.\n"; #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "debug.h" #ifdef FS_DEBUG int _dbg_lvl_ = (DL_INFO); /* DL_TRC */ #endif /* FS_DEBUG */ static union { struct fs fs; char pad[SBLOCKSIZE]; } fsun1, fsun2; #define sblock fsun1.fs /* the new superblock */ #define osblock fsun2.fs /* the old superblock */ /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; static ufs2_daddr_t sblockloc; static union { struct cg cg; char pad[MAXBSIZE]; } cgun1, cgun2; #define acg cgun1.cg /* a cylinder cgroup (new) */ #define aocg cgun2.cg /* an old cylinder group */ static struct csum *fscs; /* cylinder summary */ static void growfs(int, int, unsigned int); static void rdfs(ufs2_daddr_t, size_t, void *, int); static void wtfs(ufs2_daddr_t, size_t, void *, int, unsigned int); static int charsperline(void); static void usage(void); static int isblock(struct fs *, unsigned char *, int); static void clrblock(struct fs *, unsigned char *, int); static void setblock(struct fs *, unsigned char *, int); static void initcg(int, time_t, int, unsigned int); static void updjcg(int, time_t, int, int, unsigned int); static void updcsloc(time_t, int, int, unsigned int); static void frag_adjust(ufs2_daddr_t, int); static void updclst(int); static void mount_reload(const struct statfs *stfs); /* * Here we actually start growing the file system. We basically read the * cylinder summary from the first cylinder group as we want to update * this on the fly during our various operations. First we handle the * changes in the former last cylinder group. Afterwards we create all new * cylinder groups. Now we handle the cylinder group containing the * cylinder summary which might result in a relocation of the whole * structure. In the end we write back the updated cylinder summary, the * new superblock, and slightly patched versions of the super block * copies. */ static void growfs(int fsi, int fso, unsigned int Nflag) { DBG_FUNC("growfs") time_t modtime; uint cylno; int i, j, width; char tmpbuf[100]; DBG_ENTER; time(&modtime); /* * Get the cylinder summary into the memory. */ fscs = (struct csum *)calloc((size_t)1, (size_t)sblock.fs_cssize); if (fscs == NULL) errx(1, "calloc failed"); for (i = 0; i < osblock.fs_cssize; i += osblock.fs_bsize) { rdfs(fsbtodb(&osblock, osblock.fs_csaddr + numfrags(&osblock, i)), (size_t)MIN(osblock.fs_cssize - i, osblock.fs_bsize), (void *)(((char *)fscs) + i), fsi); } #ifdef FS_DEBUG { struct csum *dbg_csp; u_int32_t dbg_csc; char dbg_line[80]; dbg_csp = fscs; for (dbg_csc = 0; dbg_csc < osblock.fs_ncg; dbg_csc++) { snprintf(dbg_line, sizeof(dbg_line), "%d. old csum in old location", dbg_csc); DBG_DUMP_CSUM(&osblock, dbg_line, dbg_csp++); } } #endif /* FS_DEBUG */ DBG_PRINT0("fscs read\n"); /* * Do all needed changes in the former last cylinder group. */ updjcg(osblock.fs_ncg - 1, modtime, fsi, fso, Nflag); /* * Dump out summary information about file system. 
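* (FS_DEBUG builds only; the format mirrors the summary newfs(8) prints
* at file system creation time.)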
*/ #ifdef FS_DEBUG #define B2MBFACTOR (1 / (1024.0 * 1024.0)) printf("growfs: %.1fMB (%jd sectors) block size %d, fragment size %d\n", (float)sblock.fs_size * sblock.fs_fsize * B2MBFACTOR, (intmax_t)fsbtodb(&sblock, sblock.fs_size), sblock.fs_bsize, sblock.fs_fsize); printf("\tusing %d cylinder groups of %.2fMB, %d blks, %d inodes.\n", sblock.fs_ncg, (float)sblock.fs_fpg * sblock.fs_fsize * B2MBFACTOR, sblock.fs_fpg / sblock.fs_frag, sblock.fs_ipg); if (sblock.fs_flags & FS_DOSOFTDEP) printf("\twith soft updates\n"); #undef B2MBFACTOR #endif /* FS_DEBUG */ /* * Now build the cylinders group blocks and * then print out indices of cylinder groups. */ printf("super-block backups (for fsck_ffs -b #) at:\n"); i = 0; width = charsperline(); /* * Iterate for only the new cylinder groups. */ for (cylno = osblock.fs_ncg; cylno < sblock.fs_ncg; cylno++) { initcg(cylno, modtime, fso, Nflag); j = sprintf(tmpbuf, " %jd%s", (intmax_t)fsbtodb(&sblock, cgsblock(&sblock, cylno)), cylno < (sblock.fs_ncg - 1) ? "," : "" ); if (i + j >= width) { printf("\n"); i = 0; } i += j; printf("%s", tmpbuf); fflush(stdout); } printf("\n"); /* * Do all needed changes in the first cylinder group. * allocate blocks in new location */ updcsloc(modtime, fsi, fso, Nflag); /* * Now write the cylinder summary back to disk. */ for (i = 0; i < sblock.fs_cssize; i += sblock.fs_bsize) { wtfs(fsbtodb(&sblock, sblock.fs_csaddr + numfrags(&sblock, i)), (size_t)MIN(sblock.fs_cssize - i, sblock.fs_bsize), (void *)(((char *)fscs) + i), fso, Nflag); } DBG_PRINT0("fscs written\n"); #ifdef FS_DEBUG { struct csum *dbg_csp; u_int32_t dbg_csc; char dbg_line[80]; dbg_csp = fscs; for (dbg_csc = 0; dbg_csc < sblock.fs_ncg; dbg_csc++) { snprintf(dbg_line, sizeof(dbg_line), "%d. new csum in new location", dbg_csc); DBG_DUMP_CSUM(&sblock, dbg_line, dbg_csp++); } } #endif /* FS_DEBUG */ /* * Now write the new superblock back to disk. */ sblock.fs_time = modtime; wtfs(sblockloc, (size_t)SBLOCKSIZE, (void *)&sblock, fso, Nflag); DBG_PRINT0("sblock written\n"); DBG_DUMP_FS(&sblock, "new initial sblock"); /* * Clean up the dynamic fields in our superblock copies. */ sblock.fs_fmod = 0; sblock.fs_clean = 1; sblock.fs_ronly = 0; sblock.fs_cgrotor = 0; sblock.fs_state = 0; memset((void *)&sblock.fs_fsmnt, 0, sizeof(sblock.fs_fsmnt)); sblock.fs_flags &= FS_DOSOFTDEP; /* * XXX * The following fields are currently distributed from the superblock * to the copies: * fs_minfree * fs_rotdelay * fs_maxcontig * fs_maxbpg * fs_minfree, * fs_optim * fs_flags regarding SOFTPDATES * * We probably should rather change the summary for the cylinder group * statistics here to the value of what would be in there, if the file * system were created initially with the new size. Therefor we still * need to find an easy way of calculating that. * Possibly we can try to read the first superblock copy and apply the * "diffed" stats between the old and new superblock by still copying * certain parameters onto that. */ /* * Write out the duplicate super blocks. */ for (cylno = 0; cylno < sblock.fs_ncg; cylno++) { wtfs(fsbtodb(&sblock, cgsblock(&sblock, cylno)), (size_t)SBLOCKSIZE, (void *)&sblock, fso, Nflag); } DBG_PRINT0("sblock copies written\n"); DBG_DUMP_FS(&sblock, "new other sblocks"); DBG_LEAVE; return; } /* * This creates a new cylinder group structure, for more details please see * the source of newfs(8), as this function is taken over almost unchanged. * As this is never called for the first cylinder group, the special * provisions for that case are removed here. 
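* For the UFS2 case the allocation-map offsets computed below line up as
*	cg_iusedoff    = end of the fixed cg header,
*	cg_freeoff     = cg_iusedoff + howmany(fs_ipg, CHAR_BIT),
*	cg_nextfreeoff = cg_freeoff + howmany(fs_fpg, CHAR_BIT),
* with the cluster bitmap and summary appended when fs_contigsumsize > 0.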
*/ static void initcg(int cylno, time_t modtime, int fso, unsigned int Nflag) { DBG_FUNC("initcg") static caddr_t iobuf; long blkno, start; ino_t ino; ufs2_daddr_t i, cbase, dmax; struct ufs1_dinode *dp1; struct csum *cs; uint j, d, dupper, dlower; if (iobuf == NULL && (iobuf = malloc(sblock.fs_bsize * 3)) == NULL) errx(37, "panic: cannot allocate I/O buffer"); /* * Determine block bounds for cylinder group. * Allow space for super block summary information in first * cylinder group. */ cbase = cgbase(&sblock, cylno); dmax = cbase + sblock.fs_fpg; if (dmax > sblock.fs_size) dmax = sblock.fs_size; dlower = cgsblock(&sblock, cylno) - cbase; dupper = cgdmin(&sblock, cylno) - cbase; if (cylno == 0) /* XXX fscs may be relocated */ dupper += howmany(sblock.fs_cssize, sblock.fs_fsize); cs = &fscs[cylno]; memset(&acg, 0, sblock.fs_cgsize); acg.cg_time = modtime; acg.cg_magic = CG_MAGIC; acg.cg_cgx = cylno; acg.cg_niblk = sblock.fs_ipg; acg.cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock)); acg.cg_ndblk = dmax - cbase; if (sblock.fs_contigsumsize > 0) acg.cg_nclusterblks = acg.cg_ndblk / sblock.fs_frag; start = &acg.cg_space[0] - (u_char *)(&acg.cg_firstfield); if (sblock.fs_magic == FS_UFS2_MAGIC) { acg.cg_iusedoff = start; } else { acg.cg_old_ncyl = sblock.fs_old_cpg; acg.cg_old_time = acg.cg_time; acg.cg_time = 0; acg.cg_old_niblk = acg.cg_niblk; acg.cg_niblk = 0; acg.cg_initediblk = 0; acg.cg_old_btotoff = start; acg.cg_old_boff = acg.cg_old_btotoff + sblock.fs_old_cpg * sizeof(int32_t); acg.cg_iusedoff = acg.cg_old_boff + sblock.fs_old_cpg * sizeof(u_int16_t); } acg.cg_freeoff = acg.cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT); acg.cg_nextfreeoff = acg.cg_freeoff + howmany(sblock.fs_fpg, CHAR_BIT); if (sblock.fs_contigsumsize > 0) { acg.cg_clustersumoff = roundup(acg.cg_nextfreeoff, sizeof(u_int32_t)); acg.cg_clustersumoff -= sizeof(u_int32_t); acg.cg_clusteroff = acg.cg_clustersumoff + (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t); acg.cg_nextfreeoff = acg.cg_clusteroff + howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT); } if (acg.cg_nextfreeoff > (unsigned)sblock.fs_cgsize) { /* * This should never happen as we would have had that panic * already on file system creation */ errx(37, "panic: cylinder group too big"); } acg.cg_cs.cs_nifree += sblock.fs_ipg; if (cylno == 0) - for (ino = 0; ino < ROOTINO; ino++) { + for (ino = 0; ino < UFS_ROOTINO; ino++) { setbit(cg_inosused(&acg), ino); acg.cg_cs.cs_nifree--; } /* * For the old file system, we have to initialize all the inodes. */ if (sblock.fs_magic == FS_UFS1_MAGIC) { bzero(iobuf, sblock.fs_bsize); for (i = 0; i < sblock.fs_ipg / INOPF(&sblock); i += sblock.fs_frag) { dp1 = (struct ufs1_dinode *)(void *)iobuf; for (j = 0; j < INOPB(&sblock); j++) { dp1->di_gen = arc4random(); dp1++; } wtfs(fsbtodb(&sblock, cgimin(&sblock, cylno) + i), sblock.fs_bsize, iobuf, fso, Nflag); } } if (cylno > 0) { /* * In cylno 0, beginning space is reserved * for boot and super blocks. 
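* (dlower, the fragments between cgbase() and cgsblock(), therefore only
* become free data space for cylinder groups after the first one.)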
*/ for (d = 0; d < dlower; d += sblock.fs_frag) { blkno = d / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree++; } sblock.fs_dsize += dlower; } sblock.fs_dsize += acg.cg_ndblk - dupper; if ((i = dupper % sblock.fs_frag)) { acg.cg_frsum[sblock.fs_frag - i]++; for (d = dupper + sblock.fs_frag - i; dupper < d; dupper++) { setbit(cg_blksfree(&acg), dupper); acg.cg_cs.cs_nffree++; } } for (d = dupper; d + sblock.fs_frag <= acg.cg_ndblk; d += sblock.fs_frag) { blkno = d / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree++; } if (d < acg.cg_ndblk) { acg.cg_frsum[acg.cg_ndblk - d]++; for (; d < acg.cg_ndblk; d++) { setbit(cg_blksfree(&acg), d); acg.cg_cs.cs_nffree++; } } if (sblock.fs_contigsumsize > 0) { int32_t *sump = cg_clustersum(&acg); u_char *mapp = cg_clustersfree(&acg); int map = *mapp++; int bit = 1; int run = 0; for (i = 0; i < acg.cg_nclusterblks; i++) { if ((map & bit) != 0) run++; else if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; run = 0; } if ((i & (CHAR_BIT - 1)) != CHAR_BIT - 1) bit <<= 1; else { map = *mapp++; bit = 1; } } if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; } } sblock.fs_cstotal.cs_ndir += acg.cg_cs.cs_ndir; sblock.fs_cstotal.cs_nffree += acg.cg_cs.cs_nffree; sblock.fs_cstotal.cs_nbfree += acg.cg_cs.cs_nbfree; sblock.fs_cstotal.cs_nifree += acg.cg_cs.cs_nifree; *cs = acg.cg_cs; memcpy(iobuf, &acg, sblock.fs_cgsize); memset(iobuf + sblock.fs_cgsize, '\0', sblock.fs_bsize * 3 - sblock.fs_cgsize); wtfs(fsbtodb(&sblock, cgtod(&sblock, cylno)), sblock.fs_bsize * 3, iobuf, fso, Nflag); DBG_DUMP_CG(&sblock, "new cg", &acg); DBG_LEAVE; return; } /* * Here we add or subtract (sign +1/-1) the available fragments in a given * block to or from the fragment statistics. By subtracting before and adding * after an operation on the free frag map we can easy update the fragment * statistic, which seems to be otherwise a rather complex operation. */ static void frag_adjust(ufs2_daddr_t frag, int sign) { DBG_FUNC("frag_adjust") int fragsize; int f; DBG_ENTER; fragsize = 0; /* * Here frag only needs to point to any fragment in the block we want * to examine. */ for (f = rounddown(frag, sblock.fs_frag); f < roundup(frag + 1, sblock.fs_frag); f++) { /* * Count contiguous free fragments. */ if (isset(cg_blksfree(&acg), f)) { fragsize++; } else { if (fragsize && fragsize < sblock.fs_frag) { /* * We found something in between. */ acg.cg_frsum[fragsize] += sign; DBG_PRINT2("frag_adjust [%d]+=%d\n", fragsize, sign); } fragsize = 0; } } if (fragsize && fragsize < sblock.fs_frag) { /* * We found something. */ acg.cg_frsum[fragsize] += sign; DBG_PRINT2("frag_adjust [%d]+=%d\n", fragsize, sign); } DBG_PRINT2("frag_adjust [[%d]]+=%d\n", fragsize, sign); DBG_LEAVE; return; } /* * Here we do all needed work for the former last cylinder group. It has to be * changed in any case, even if the file system ended exactly on the end of * this group, as there is some slightly inconsistent handling of the number * of cylinders in the cylinder group. We start again by reading the cylinder * group from disk. 
If the last block was not fully available, we first handle * the missing fragments, then we handle all new full blocks in that file * system and finally we handle the new last fragmented block in the file * system. We again have to handle the fragment statistics rotational layout * tables and cluster summary during all those operations. */ static void updjcg(int cylno, time_t modtime, int fsi, int fso, unsigned int Nflag) { DBG_FUNC("updjcg") ufs2_daddr_t cbase, dmax, dupper; struct csum *cs; int i, k; int j = 0; DBG_ENTER; /* * Read the former last (joining) cylinder group from disk, and make * a copy. */ rdfs(fsbtodb(&osblock, cgtod(&osblock, cylno)), (size_t)osblock.fs_cgsize, (void *)&aocg, fsi); DBG_PRINT0("jcg read\n"); DBG_DUMP_CG(&sblock, "old joining cg", &aocg); memcpy((void *)&cgun1, (void *)&cgun2, sizeof(cgun2)); /* * If the cylinder group had already its new final size almost * nothing is to be done ... except: * For some reason the value of cg_ncyl in the last cylinder group has * to be zero instead of fs_cpg. As this is now no longer the last * cylinder group we have to change that value now to fs_cpg. */ if (cgbase(&osblock, cylno + 1) == osblock.fs_size) { if (sblock.fs_magic == FS_UFS1_MAGIC) acg.cg_old_ncyl = sblock.fs_old_cpg; wtfs(fsbtodb(&sblock, cgtod(&sblock, cylno)), (size_t)sblock.fs_cgsize, (void *)&acg, fso, Nflag); DBG_PRINT0("jcg written\n"); DBG_DUMP_CG(&sblock, "new joining cg", &acg); DBG_LEAVE; return; } /* * Set up some variables needed later. */ cbase = cgbase(&sblock, cylno); dmax = cbase + sblock.fs_fpg; if (dmax > sblock.fs_size) dmax = sblock.fs_size; dupper = cgdmin(&sblock, cylno) - cbase; if (cylno == 0) /* XXX fscs may be relocated */ dupper += howmany(sblock.fs_cssize, sblock.fs_fsize); /* * Set pointer to the cylinder summary for our cylinder group. */ cs = fscs + cylno; /* * Touch the cylinder group, update all fields in the cylinder group as * needed, update the free space in the superblock. */ acg.cg_time = modtime; if ((unsigned)cylno == sblock.fs_ncg - 1) { /* * This is still the last cylinder group. */ if (sblock.fs_magic == FS_UFS1_MAGIC) acg.cg_old_ncyl = sblock.fs_old_ncyl % sblock.fs_old_cpg; } else { acg.cg_old_ncyl = sblock.fs_old_cpg; } DBG_PRINT2("jcg dbg: %d %u", cylno, sblock.fs_ncg); #ifdef FS_DEBUG if (sblock.fs_magic == FS_UFS1_MAGIC) DBG_PRINT2("%d %u", acg.cg_old_ncyl, sblock.fs_old_cpg); #endif DBG_PRINT0("\n"); acg.cg_ndblk = dmax - cbase; sblock.fs_dsize += acg.cg_ndblk - aocg.cg_ndblk; if (sblock.fs_contigsumsize > 0) acg.cg_nclusterblks = acg.cg_ndblk / sblock.fs_frag; /* * Now we have to update the free fragment bitmap for our new free * space. There again we have to handle the fragmentation and also * the rotational layout tables and the cluster summary. This is * also done per fragment for the first new block if the old file * system end was not on a block boundary, per fragment for the new * last block if the new file system end is not on a block boundary, * and per block for all space in between. * * Handle the first new block here if it was partially available * before. 
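* A worked example: with fs_frag = 8 and the old file system ending three
* fragments into its last block, the five new fragments are freed one at
* a time; if the whole block then turns out to be free, it is promoted
* from the cg_frsum[] fragment counts to a full block in cs_nbfree.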
*/ if (osblock.fs_size % sblock.fs_frag) { if (roundup(osblock.fs_size, sblock.fs_frag) <= sblock.fs_size) { /* * The new space is enough to fill at least this * block */ j = 0; for (i = roundup(osblock.fs_size - cbase, sblock.fs_frag) - 1; i >= osblock.fs_size - cbase; i--) { setbit(cg_blksfree(&acg), i); acg.cg_cs.cs_nffree++; j++; } /* * Check if the fragment just created could join an * already existing fragment at the former end of the * file system. */ if (isblock(&sblock, cg_blksfree(&acg), ((osblock.fs_size - cgbase(&sblock, cylno)) / sblock.fs_frag))) { /* * The block is now completely available. */ DBG_PRINT0("block was\n"); acg.cg_frsum[osblock.fs_size % sblock.fs_frag]--; acg.cg_cs.cs_nbfree++; acg.cg_cs.cs_nffree -= sblock.fs_frag; k = rounddown(osblock.fs_size - cbase, sblock.fs_frag); updclst((osblock.fs_size - cbase) / sblock.fs_frag); } else { /* * Let's rejoin a possibly partially grown * fragment. */ k = 0; while (isset(cg_blksfree(&acg), i) && (i >= rounddown(osblock.fs_size - cbase, sblock.fs_frag))) { i--; k++; } if (k) acg.cg_frsum[k]--; acg.cg_frsum[k + j]++; } } else { /* * We only grow by some fragments within this last * block. */ for (i = sblock.fs_size - cbase - 1; i >= osblock.fs_size - cbase; i--) { setbit(cg_blksfree(&acg), i); acg.cg_cs.cs_nffree++; j++; } /* * Let's rejoin a possibly partially grown fragment. */ k = 0; while (isset(cg_blksfree(&acg), i) && (i >= rounddown(osblock.fs_size - cbase, sblock.fs_frag))) { i--; k++; } if (k) acg.cg_frsum[k]--; acg.cg_frsum[k + j]++; } } /* * Handle all new complete blocks here. */ for (i = roundup(osblock.fs_size - cbase, sblock.fs_frag); i + sblock.fs_frag <= dmax - cbase; /* XXX <= or only < ? */ i += sblock.fs_frag) { j = i / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), j); updclst(j); acg.cg_cs.cs_nbfree++; } /* * Handle the last new block if there are still some new fragments left. * Here we don't have to bother about the cluster summary or even * the rotational layout table. */ if (i < (dmax - cbase)) { acg.cg_frsum[dmax - cbase - i]++; for (; i < dmax - cbase; i++) { setbit(cg_blksfree(&acg), i); acg.cg_cs.cs_nffree++; } } sblock.fs_cstotal.cs_nffree += (acg.cg_cs.cs_nffree - aocg.cg_cs.cs_nffree); sblock.fs_cstotal.cs_nbfree += (acg.cg_cs.cs_nbfree - aocg.cg_cs.cs_nbfree); /* * The following statistics are not changed here: * sblock.fs_cstotal.cs_ndir * sblock.fs_cstotal.cs_nifree * As the statistics for this cylinder group are ready, copy them to * the summary information array. */ *cs = acg.cg_cs; /* * Write the updated "joining" cylinder group back to disk. */ wtfs(fsbtodb(&sblock, cgtod(&sblock, cylno)), (size_t)sblock.fs_cgsize, (void *)&acg, fso, Nflag); DBG_PRINT0("jcg written\n"); DBG_DUMP_CG(&sblock, "new joining cg", &acg); DBG_LEAVE; return; } /* * Here we update the location of the cylinder summary. We have two possible * ways of growing the cylinder summary: * (1) We can try to grow the summary in the current location, and relocate * possibly used blocks within the current cylinder group. * (2) Alternatively we can relocate the whole cylinder summary to the first * new completely empty cylinder group. Once the cylinder summary is no * longer in the beginning of the first cylinder group, you should never * use a version of fsck which is not aware of the possibility to have * this structure in a non-standard place. * Option (2) is considered to be less intrusive to the structure of the file- * system, so that's the one being used. 
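 */

/*
 * A minimal sketch of option (2), not part of this change; the helper
 * name is illustrative only. The relocated summary starts at the first
 * data fragment of the first completely new cylinder group and occupies
 * fs_cssize bytes worth of fragments.
 */
static ufs2_daddr_t
new_csaddr_sketch(void)
{
	ufs2_daddr_t csaddr;
	int csfrags;

	/* first data fragment of the first completely new cylinder group */
	csaddr = cgdmin(&sblock, osblock.fs_ncg);
	/* number of fragments the summary occupies at its new home */
	csfrags = howmany(sblock.fs_cssize, sblock.fs_fsize);
	return (csaddr + csfrags);	/* first fragment past the summary */
}

/*
 * updcsloc() below implements this relocation.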
*/ static void updcsloc(time_t modtime, int fsi, int fso, unsigned int Nflag) { DBG_FUNC("updcsloc") struct csum *cs; int ocscg, ncscg; ufs2_daddr_t d; int lcs = 0; int block; DBG_ENTER; if (howmany(sblock.fs_cssize, sblock.fs_fsize) == howmany(osblock.fs_cssize, osblock.fs_fsize)) { /* * No new fragment needed. */ DBG_LEAVE; return; } ocscg = dtog(&osblock, osblock.fs_csaddr); cs = fscs + ocscg; /* * Read original cylinder group from disk, and make a copy. * XXX If Nflag is set in some very rare cases we now miss * some changes done in updjcg by reading the unmodified * block from disk. */ rdfs(fsbtodb(&osblock, cgtod(&osblock, ocscg)), (size_t)osblock.fs_cgsize, (void *)&aocg, fsi); DBG_PRINT0("oscg read\n"); DBG_DUMP_CG(&sblock, "old summary cg", &aocg); memcpy((void *)&cgun1, (void *)&cgun2, sizeof(cgun2)); /* * Touch the cylinder group, set up local variables needed later * and update the superblock. */ acg.cg_time = modtime; /* * XXX In the case of having active snapshots we may need many more * blocks for the copy on write. We need each block twice, and * also up to 8*3 blocks for indirect blocks for all possible * references. */ /* * There is not enough space in the old cylinder group to * relocate all blocks as needed, so we relocate the whole * cylinder group summary to a new group. We try to use the * first complete new cylinder group just created. Within the * cylinder group we align the area immediately after the * cylinder group information location in order to be as * close as possible to the original implementation of ffs. * * First we have to make sure we'll find enough space in the * new cylinder group. If not, then we currently give up. * We start with freeing everything which was used by the * fragments of the old cylinder summary in the current group. * Now we write back the group meta data, read in the needed * meta data from the new cylinder group, and start allocating * within that group. Here we can assume the group to be * completely empty, which makes the handling of fragments and * clusters a lot easier. */ DBG_TRC; if (sblock.fs_ncg - osblock.fs_ncg < 2) errx(2, "panic: not enough space"); /* * Point "d" to the first fragment not used by the cylinder * summary. */ d = osblock.fs_csaddr + (osblock.fs_cssize / osblock.fs_fsize); /* * Set up last cluster size ("lcs") already here. Calculate * the size for the trailing cluster just behind where "d" * points to. */ if (sblock.fs_contigsumsize > 0) { for (block = howmany(d % sblock.fs_fpg, sblock.fs_frag), lcs = 0; lcs < sblock.fs_contigsumsize; block++, lcs++) { if (isclr(cg_clustersfree(&acg), block)) break; } } /* * Point "d" to the last frag used by the cylinder summary. */ d--; DBG_PRINT1("d=%jd\n", (intmax_t)d); if ((d + 1) % sblock.fs_frag) { /* * The end of the cylinder summary is not a complete * block. */ DBG_TRC; frag_adjust(d % sblock.fs_fpg, -1); for (; (d + 1) % sblock.fs_frag; d--) { DBG_PRINT1("d=%jd\n", (intmax_t)d); setbit(cg_blksfree(&acg), d % sblock.fs_fpg); acg.cg_cs.cs_nffree++; sblock.fs_cstotal.cs_nffree++; } /* * Point "d" to the last fragment of the last * (incomplete) block of the cylinder summary. 
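 * As a hedged illustration of the cluster bookkeeping used below:
 * freeing one more block in front of a trailing free cluster of length
 * "lcs" just moves one count between cg_clustersum buckets,
 *
 *	cg_clustersum(&acg)[lcs]--;	(only if lcs was nonzero)
 *	cg_clustersum(&acg)[++lcs]++;	(the cluster grew by one block)
 *
 * which is exactly the pattern repeated in the two loops that follow.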
*/ d++; frag_adjust(d % sblock.fs_fpg, 1); if (isblock(&sblock, cg_blksfree(&acg), (d % sblock.fs_fpg) / sblock.fs_frag)) { DBG_PRINT1("d=%jd\n", (intmax_t)d); acg.cg_cs.cs_nffree -= sblock.fs_frag; acg.cg_cs.cs_nbfree++; sblock.fs_cstotal.cs_nffree -= sblock.fs_frag; sblock.fs_cstotal.cs_nbfree++; if (sblock.fs_contigsumsize > 0) { setbit(cg_clustersfree(&acg), (d % sblock.fs_fpg) / sblock.fs_frag); if (lcs < sblock.fs_contigsumsize) { if (lcs) cg_clustersum(&acg)[lcs]--; lcs++; cg_clustersum(&acg)[lcs]++; } } } /* * Point "d" to the first fragment of the block before * the last incomplete block. */ d--; } DBG_PRINT1("d=%jd\n", (intmax_t)d); for (d = rounddown(d, sblock.fs_frag); d >= osblock.fs_csaddr; d -= sblock.fs_frag) { DBG_TRC; DBG_PRINT1("d=%jd\n", (intmax_t)d); setblock(&sblock, cg_blksfree(&acg), (d % sblock.fs_fpg) / sblock.fs_frag); acg.cg_cs.cs_nbfree++; sblock.fs_cstotal.cs_nbfree++; if (sblock.fs_contigsumsize > 0) { setbit(cg_clustersfree(&acg), (d % sblock.fs_fpg) / sblock.fs_frag); /* * The last cluster size is already set up. */ if (lcs < sblock.fs_contigsumsize) { if (lcs) cg_clustersum(&acg)[lcs]--; lcs++; cg_clustersum(&acg)[lcs]++; } } } *cs = acg.cg_cs; /* * Now write the former cylinder group containing the cylinder * summary back to disk. */ wtfs(fsbtodb(&sblock, cgtod(&sblock, ocscg)), (size_t)sblock.fs_cgsize, (void *)&acg, fso, Nflag); DBG_PRINT0("oscg written\n"); DBG_DUMP_CG(&sblock, "old summary cg", &acg); /* * Find the beginning of the new cylinder group containing the * cylinder summary. */ sblock.fs_csaddr = cgdmin(&sblock, osblock.fs_ncg); ncscg = dtog(&sblock, sblock.fs_csaddr); cs = fscs + ncscg; /* * If Nflag is specified, we would now read random data instead * of an empty cg structure from disk. So we can't simulate that * part for now. */ if (Nflag) { DBG_PRINT0("nscg update skipped\n"); DBG_LEAVE; return; } /* * Read the future cylinder group containing the cylinder * summary from disk, and make a copy. */ rdfs(fsbtodb(&sblock, cgtod(&sblock, ncscg)), (size_t)sblock.fs_cgsize, (void *)&aocg, fsi); DBG_PRINT0("nscg read\n"); DBG_DUMP_CG(&sblock, "new summary cg", &aocg); memcpy((void *)&cgun1, (void *)&cgun2, sizeof(cgun2)); /* * Allocate all complete blocks used by the new cylinder * summary. */ for (d = sblock.fs_csaddr; d + sblock.fs_frag <= sblock.fs_csaddr + (sblock.fs_cssize / sblock.fs_fsize); d += sblock.fs_frag) { clrblock(&sblock, cg_blksfree(&acg), (d % sblock.fs_fpg) / sblock.fs_frag); acg.cg_cs.cs_nbfree--; sblock.fs_cstotal.cs_nbfree--; if (sblock.fs_contigsumsize > 0) { clrbit(cg_clustersfree(&acg), (d % sblock.fs_fpg) / sblock.fs_frag); } } /* * Allocate all fragments used by the cylinder summary in the * last block. */ if (d < sblock.fs_csaddr + (sblock.fs_cssize / sblock.fs_fsize)) { for (; d - sblock.fs_csaddr < sblock.fs_cssize/sblock.fs_fsize; d++) { clrbit(cg_blksfree(&acg), d % sblock.fs_fpg); acg.cg_cs.cs_nffree--; sblock.fs_cstotal.cs_nffree--; } acg.cg_cs.cs_nbfree--; acg.cg_cs.cs_nffree += sblock.fs_frag; sblock.fs_cstotal.cs_nbfree--; sblock.fs_cstotal.cs_nffree += sblock.fs_frag; if (sblock.fs_contigsumsize > 0) clrbit(cg_clustersfree(&acg), (d % sblock.fs_fpg) / sblock.fs_frag); frag_adjust(d % sblock.fs_fpg, 1); } /* * XXX Handle the cluster statistics here in the case this * cylinder group is now almost full, and the remaining * space is less than the maximum cluster size. 
This is * probably not needed, as you would hardly find a file * system which has only MAXCSBUFS+FS_MAXCONTIG of free * space right behind the cylinder group information in * any new cylinder group. */ /* * Update our statistics in the cylinder summary. */ *cs = acg.cg_cs; /* * Write the new cylinder group containing the cylinder summary * back to disk. */ wtfs(fsbtodb(&sblock, cgtod(&sblock, ncscg)), (size_t)sblock.fs_cgsize, (void *)&acg, fso, Nflag); DBG_PRINT0("nscg written\n"); DBG_DUMP_CG(&sblock, "new summary cg", &acg); DBG_LEAVE; return; } /* * Here we read some block(s) from disk. */ static void rdfs(ufs2_daddr_t bno, size_t size, void *bf, int fsi) { DBG_FUNC("rdfs") ssize_t n; DBG_ENTER; if (bno < 0) err(32, "rdfs: attempting to read negative block number"); if (lseek(fsi, (off_t)bno * DEV_BSIZE, 0) < 0) err(33, "rdfs: seek error: %jd", (intmax_t)bno); n = read(fsi, bf, size); if (n != (ssize_t)size) err(34, "rdfs: read error: %jd", (intmax_t)bno); DBG_LEAVE; return; } /* * Here we write some block(s) to disk. */ static void wtfs(ufs2_daddr_t bno, size_t size, void *bf, int fso, unsigned int Nflag) { DBG_FUNC("wtfs") ssize_t n; DBG_ENTER; if (Nflag) { DBG_LEAVE; return; } if (lseek(fso, (off_t)bno * DEV_BSIZE, SEEK_SET) < 0) err(35, "wtfs: seek error: %ld", (long)bno); n = write(fso, bf, size); if (n != (ssize_t)size) err(36, "wtfs: write error: %ld", (long)bno); DBG_LEAVE; return; } /* * Here we check if all frags of a block are free. For more details again * please see the source of newfs(8), as this function is taken over almost * unchanged. */ static int isblock(struct fs *fs, unsigned char *cp, int h) { DBG_FUNC("isblock") unsigned char mask; DBG_ENTER; switch (fs->fs_frag) { case 8: DBG_LEAVE; return (cp[h] == 0xff); case 4: mask = 0x0f << ((h & 0x1) << 2); DBG_LEAVE; return ((cp[h >> 1] & mask) == mask); case 2: mask = 0x03 << ((h & 0x3) << 1); DBG_LEAVE; return ((cp[h >> 2] & mask) == mask); case 1: mask = 0x01 << (h & 0x7); DBG_LEAVE; return ((cp[h >> 3] & mask) == mask); default: fprintf(stderr, "isblock bad fs_frag %d\n", fs->fs_frag); DBG_LEAVE; return (0); } } /* * Here we allocate a complete block in the block map. For more details again * please see the source of newfs(8), as this function is taken over almost * unchanged. */ static void clrblock(struct fs *fs, unsigned char *cp, int h) { DBG_FUNC("clrblock") DBG_ENTER; switch ((fs)->fs_frag) { case 8: cp[h] = 0; break; case 4: cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); break; case 2: cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); break; case 1: cp[h >> 3] &= ~(0x01 << (h & 0x7)); break; default: warnx("clrblock bad fs_frag %d", fs->fs_frag); break; } DBG_LEAVE; return; } /* * Here we free a complete block in the free block map. For more details again * please see the source of newfs(8), as this function is taken over almost * unchanged. */ static void setblock(struct fs *fs, unsigned char *cp, int h) { DBG_FUNC("setblock") DBG_ENTER; switch (fs->fs_frag) { case 8: cp[h] = 0xff; break; case 4: cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); break; case 2: cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); break; case 1: cp[h >> 3] |= (0x01 << (h & 0x7)); break; default: warnx("setblock bad fs_frag %d", fs->fs_frag); break; } DBG_LEAVE; return; } /* * Figure out how many characters per line our current terminal has. For more * details again please see the source of newfs(8), as this function is taken * over almost unchanged. 
*/ static int charsperline(void) { DBG_FUNC("charsperline") int columns; char *cp; struct winsize ws; DBG_ENTER; columns = 0; if (ioctl(0, TIOCGWINSZ, &ws) != -1) columns = ws.ws_col; if (columns == 0 && (cp = getenv("COLUMNS"))) columns = atoi(cp); if (columns == 0) columns = 80; /* last resort */ DBG_LEAVE; return (columns); } static int is_dev(const char *name) { struct stat devstat; if (stat(name, &devstat) != 0) return (0); if (!S_ISCHR(devstat.st_mode)) return (0); return (1); } /* * Return the mountpoint on which the device is currently mounted. */ static const struct statfs * dev_to_statfs(const char *dev) { struct stat devstat, mntdevstat; struct statfs *mntbuf, *statfsp; char device[MAXPATHLEN]; char *mntdevname; int i, mntsize; /* * First check the mounted filesystems. */ if (stat(dev, &devstat) != 0) return (NULL); if (!S_ISCHR(devstat.st_mode) && !S_ISBLK(devstat.st_mode)) return (NULL); mntsize = getmntinfo(&mntbuf, MNT_NOWAIT); for (i = 0; i < mntsize; i++) { statfsp = &mntbuf[i]; mntdevname = statfsp->f_mntfromname; if (*mntdevname != '/') { strcpy(device, _PATH_DEV); strcat(device, mntdevname); mntdevname = device; } if (stat(mntdevname, &mntdevstat) == 0 && mntdevstat.st_rdev == devstat.st_rdev) return (statfsp); } return (NULL); } static const char * mountpoint_to_dev(const char *mountpoint) { struct statfs *mntbuf, *statfsp; struct fstab *fs; int i, mntsize; /* * First check the mounted filesystems. */ mntsize = getmntinfo(&mntbuf, MNT_NOWAIT); for (i = 0; i < mntsize; i++) { statfsp = &mntbuf[i]; if (strcmp(statfsp->f_mntonname, mountpoint) == 0) return (statfsp->f_mntfromname); } /* * Check the fstab. */ fs = getfsfile(mountpoint); if (fs != NULL) return (fs->fs_spec); return (NULL); } static const char * getdev(const char *name) { static char device[MAXPATHLEN]; const char *cp, *dev; if (is_dev(name)) return (name); cp = strrchr(name, '/'); if (cp == NULL) { snprintf(device, sizeof(device), "%s%s", _PATH_DEV, name); if (is_dev(device)) return (device); } dev = mountpoint_to_dev(name); if (dev != NULL && is_dev(dev)) return (dev); return (NULL); } /* * growfs(8) is a utility which allows one to increase the size of an existing * ufs file system. Currently this can only be done on an unmounted file * system. It recognizes some command line options to specify the new desired * size, and it does some basic checks. The old file system size is * determined, and after some more checks (e.g. that we can really access the * new last block on the disk) we calculate the new parameters for the * superblock. After having done this we just call growfs() which will do the * work. * We still have to provide support for snapshots. Therefore we first have to * understand what data structures are always replicated in the snapshot on * creation; for all other blocks we touch during our procedure, we have to * keep the old blocks unchanged somewhere available for the snapshots. If we * are lucky, then we only have to handle our blocks to be relocated in that * way. * Also we have to consider in what order we actually update the critical * data structures of the file system to make sure that, in case of a * disaster, fsck(8) is still able to restore any lost data. * The foreseen last step then will be to provide for growing even mounted * file systems. There we have to extend the mount() system call to provide * userland access to the file system locking facility. 
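 */

/*
 * A minimal sketch, not part of this change, of how main() scales the
 * -s argument below; the helper name is illustrative only. A bare
 * number counts DEV_BSIZE sectors, while the b/k/m/g/t suffixes select
 * bytes and binary powers thereof.
 */
static uint64_t
scale_size_sketch(uint64_t size, int suffix)
{
	switch (suffix) {
	case '\0':			/* bare number: DEV_BSIZE sectors */
		return (size * DEV_BSIZE);
	case 'b':			/* plain bytes */
		return (size);
	case 'k':
		return (size << 10);
	case 'm':
		return (size << 20);
	case 'g':
		return (size << 30);
	case 't':
		return (size << 40);
	default:			/* rejected with errx() in main() */
		return (size);
	}
}

/*
 * main() follows.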
*/ int main(int argc, char **argv) { DBG_FUNC("main") const char *device; const struct statfs *statfsp; uint64_t size = 0; off_t mediasize; int error, i, j, fsi, fso, ch, Nflag = 0, yflag = 0; char *p, reply[5], oldsizebuf[6], newsizebuf[6]; void *testbuf; DBG_ENTER; while ((ch = getopt(argc, argv, "Ns:vy")) != -1) { switch(ch) { case 'N': Nflag = 1; break; case 's': size = (off_t)strtoumax(optarg, &p, 0); if (p == NULL || *p == '\0') size *= DEV_BSIZE; else if (*p == 'b' || *p == 'B') ; /* do nothing */ else if (*p == 'k' || *p == 'K') size <<= 10; else if (*p == 'm' || *p == 'M') size <<= 20; else if (*p == 'g' || *p == 'G') size <<= 30; else if (*p == 't' || *p == 'T') { size <<= 30; size <<= 10; } else errx(1, "unknown suffix on -s argument"); break; case 'v': /* for compatibility to newfs */ break; case 'y': yflag = 1; break; case '?': /* FALLTHROUGH */ default: usage(); } } argc -= optind; argv += optind; if (argc != 1) usage(); /* * Now try to guess the device name. */ device = getdev(*argv); if (device == NULL) errx(1, "cannot find special device for %s", *argv); statfsp = dev_to_statfs(device); fsi = open(device, O_RDONLY); if (fsi < 0) err(1, "%s", device); /* * Try to guess the slice size if not specified. */ if (ioctl(fsi, DIOCGMEDIASIZE, &mediasize) == -1) err(1,"DIOCGMEDIASIZE"); /* * Check if that partition is suitable for growing a file system. */ if (mediasize < 1) errx(1, "partition is unavailable"); /* * Read the current superblock, and take a backup. */ for (i = 0; sblock_try[i] != -1; i++) { sblockloc = sblock_try[i] / DEV_BSIZE; rdfs(sblockloc, (size_t)SBLOCKSIZE, (void *)&(osblock), fsi); if ((osblock.fs_magic == FS_UFS1_MAGIC || (osblock.fs_magic == FS_UFS2_MAGIC && osblock.fs_sblockloc == sblock_try[i])) && osblock.fs_bsize <= MAXBSIZE && osblock.fs_bsize >= (int32_t) sizeof(struct fs)) break; } if (sblock_try[i] == -1) errx(1, "superblock not recognized"); memcpy((void *)&fsun1, (void *)&fsun2, sizeof(fsun2)); DBG_OPEN("/tmp/growfs.debug"); /* already here we need a superblock */ DBG_DUMP_FS(&sblock, "old sblock"); /* * Determine size to grow to. Default to the device size. */ if (size == 0) size = mediasize; else { if (size > (uint64_t)mediasize) { humanize_number(oldsizebuf, sizeof(oldsizebuf), size, "B", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); humanize_number(newsizebuf, sizeof(newsizebuf), mediasize, "B", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); errx(1, "requested size %s is larger " "than the available %s", oldsizebuf, newsizebuf); } } /* * Make sure the new size is a multiple of fs_fsize; /dev/ufssuspend * only supports fragment-aligned IO requests. */ size -= size % osblock.fs_fsize; if (size <= (uint64_t)(osblock.fs_size * osblock.fs_fsize)) { humanize_number(oldsizebuf, sizeof(oldsizebuf), osblock.fs_size * osblock.fs_fsize, "B", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); humanize_number(newsizebuf, sizeof(newsizebuf), size, "B", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); errx(1, "requested size %s is not larger than the current " "filesystem size %s", newsizebuf, oldsizebuf); } sblock.fs_size = dbtofsb(&osblock, size / DEV_BSIZE); sblock.fs_providersize = dbtofsb(&osblock, mediasize / DEV_BSIZE); /* * Are we really growing? */ if (osblock.fs_size >= sblock.fs_size) { errx(1, "we are not growing (%jd->%jd)", (intmax_t)osblock.fs_size, (intmax_t)sblock.fs_size); } /* * Check if we find an active snapshot. 
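 * (Looking back at the size rounding above, a hedged worked example:
 * with 2 KB fragments, a requested size of 10,000,000 bytes is trimmed
 * to 9,998,336 bytes, i.e. 4882 whole fragments, before dbtofsb()
 * converts it to the new fs_size.)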
*/ if (yflag == 0) { for (j = 0; j < FSMAXSNAP; j++) { if (sblock.fs_snapinum[j]) { errx(1, "active snapshot found in file system; " "please remove all snapshots before " "using growfs"); } if (!sblock.fs_snapinum[j]) /* list is dense */ break; } } if (yflag == 0 && Nflag == 0) { if (statfsp != NULL && (statfsp->f_flags & MNT_RDONLY) == 0) printf("Device is mounted read-write; resizing will " "result in temporary write suspension for %s.\n", statfsp->f_mntonname); printf("It's strongly recommended to make a backup " "before growing the file system.\n" "OK to grow filesystem on %s", device); if (statfsp != NULL) printf(", mounted on %s,", statfsp->f_mntonname); humanize_number(oldsizebuf, sizeof(oldsizebuf), osblock.fs_size * osblock.fs_fsize, "B", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); humanize_number(newsizebuf, sizeof(newsizebuf), sblock.fs_size * sblock.fs_fsize, "B", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); printf(" from %s to %s? [yes/no] ", oldsizebuf, newsizebuf); fflush(stdout); fgets(reply, (int)sizeof(reply), stdin); if (strcasecmp(reply, "yes\n")){ printf("Response other than \"yes\"; aborting\n"); exit(0); } } /* * Try to access our device for writing. If it's not mounted, * or mounted read-only, simply open it; otherwise, use UFS * suspension mechanism. */ if (Nflag) { fso = -1; } else { if (statfsp != NULL && (statfsp->f_flags & MNT_RDONLY) == 0) { fso = open(_PATH_UFSSUSPEND, O_RDWR); if (fso == -1) err(1, "unable to open %s", _PATH_UFSSUSPEND); error = ioctl(fso, UFSSUSPEND, &statfsp->f_fsid); if (error != 0) err(1, "UFSSUSPEND"); } else { fso = open(device, O_WRONLY); if (fso < 0) err(1, "%s", device); } } /* * Try to access our new last block in the file system. */ testbuf = malloc(sblock.fs_fsize); if (testbuf == NULL) err(1, "malloc"); rdfs((ufs2_daddr_t)((size - sblock.fs_fsize) / DEV_BSIZE), sblock.fs_fsize, testbuf, fsi); wtfs((ufs2_daddr_t)((size - sblock.fs_fsize) / DEV_BSIZE), sblock.fs_fsize, testbuf, fso, Nflag); free(testbuf); /* * Now calculate new superblock values and check for reasonable * bound for new file system size: * fs_size: is derived from user input * fs_dsize: should get updated in the routines creating or * updating the cylinder groups on the fly * fs_cstotal: should get updated in the routines creating or * updating the cylinder groups */ /* * Update the number of cylinders and cylinder groups in the file system. */ if (sblock.fs_magic == FS_UFS1_MAGIC) { sblock.fs_old_ncyl = sblock.fs_size * sblock.fs_old_nspf / sblock.fs_old_spc; if (sblock.fs_size * sblock.fs_old_nspf > sblock.fs_old_ncyl * sblock.fs_old_spc) sblock.fs_old_ncyl++; } sblock.fs_ncg = howmany(sblock.fs_size, sblock.fs_fpg); /* * Allocate last cylinder group only if there is enough room * for at least one data block. */ if (sblock.fs_size % sblock.fs_fpg != 0 && sblock.fs_size <= cgdmin(&sblock, sblock.fs_ncg - 1)) { humanize_number(oldsizebuf, sizeof(oldsizebuf), (sblock.fs_size % sblock.fs_fpg) * sblock.fs_fsize, "B", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); warnx("no room to allocate last cylinder group; " "leaving %s unused", oldsizebuf); sblock.fs_ncg--; if (sblock.fs_magic == FS_UFS1_MAGIC) sblock.fs_old_ncyl = sblock.fs_ncg * sblock.fs_old_cpg; sblock.fs_size = sblock.fs_ncg * sblock.fs_fpg; } /* * Update the space for the cylinder group summary information in the * respective cylinder group data area. 
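 * As a hedged illustration, fragroundup() below is plain mask
 * arithmetic on the power-of-two fragment size,
 *
 *	(size + fs_fsize - 1) & ~(fs_fsize - 1)
 *
 * so, assuming 16-byte struct csum entries and 2 KB fragments, 100
 * cylinder groups (1600 bytes of summary) round up to one fragment.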
*/ sblock.fs_cssize = fragroundup(&sblock, sblock.fs_ncg * sizeof(struct csum)); if (osblock.fs_size >= sblock.fs_size) errx(1, "not enough new space"); DBG_PRINT0("sblock calculated\n"); /* * Ok, everything prepared, so now let's do the tricks. */ growfs(fsi, fso, Nflag); close(fsi); if (fso > -1) { if (statfsp != NULL && (statfsp->f_flags & MNT_RDONLY) == 0) { error = ioctl(fso, UFSRESUME); if (error != 0) err(1, "UFSRESUME"); } error = close(fso); if (error != 0) err(1, "close"); if (statfsp != NULL && (statfsp->f_flags & MNT_RDONLY) != 0) mount_reload(statfsp); } DBG_CLOSE; DBG_LEAVE; return (0); } /* * Dump a line of usage. */ static void usage(void) { DBG_FUNC("usage") DBG_ENTER; fprintf(stderr, "usage: growfs [-Ny] [-s size] special | filesystem\n"); DBG_LEAVE; exit(1); } /* * This updates most parameters and the bitmap related to cluster. We have to * assume that sblock, osblock, acg are set up. */ static void updclst(int block) { DBG_FUNC("updclst") static int lcs = 0; DBG_ENTER; if (sblock.fs_contigsumsize < 1) /* no clustering */ return; /* * update cluster allocation map */ setbit(cg_clustersfree(&acg), block); /* * update cluster summary table */ if (!lcs) { /* * calculate size for the trailing cluster */ for (block--; lcs < sblock.fs_contigsumsize; block--, lcs++ ) { if (isclr(cg_clustersfree(&acg), block)) break; } } if (lcs < sblock.fs_contigsumsize) { if (lcs) cg_clustersum(&acg)[lcs]--; lcs++; cg_clustersum(&acg)[lcs]++; } DBG_LEAVE; return; } static void mount_reload(const struct statfs *stfs) { char errmsg[255]; struct iovec *iov; int iovlen; iov = NULL; iovlen = 0; *errmsg = '\0'; build_iovec(&iov, &iovlen, "fstype", __DECONST(char *, "ffs"), 4); build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, stfs->f_mntonname), (size_t)-1); build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); build_iovec(&iov, &iovlen, "update", NULL, 0); build_iovec(&iov, &iovlen, "reload", NULL, 0); if (nmount(iov, iovlen, stfs->f_flags) < 0) { errmsg[sizeof(errmsg) - 1] = '\0'; err(9, "%s: cannot reload filesystem%s%s", stfs->f_mntonname, *errmsg != '\0' ? ": " : "", errmsg); } } Index: head/sbin/newfs/mkfs.c =================================================================== --- head/sbin/newfs/mkfs.c (revision 313779) +++ head/sbin/newfs/mkfs.c (revision 313780) @@ -1,1157 +1,1158 @@ /* * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program. * * Copyright (c) 1980, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static char sccsid[] = "@(#)mkfs.c 8.11 (Berkeley) 5/3/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "newfs.h" /* * make file system for cylinder-group style file systems */ #define UMASK 0755 #define POWEROF2(num) (((num) & ((num) - 1)) == 0) static struct csum *fscs; #define sblock disk.d_fs #define acg disk.d_cg union dinode { struct ufs1_dinode dp1; struct ufs2_dinode dp2; }; #define DIP(dp, field) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) static caddr_t iobuf; static long iobufsize; static ufs2_daddr_t alloc(int size, int mode); static int charsperline(void); static void clrblock(struct fs *, unsigned char *, int); static void fsinit(time_t); static int ilog2(int); static void initcg(int, time_t); static int isblock(struct fs *, unsigned char *, int); static void iput(union dinode *, ino_t); static int makedir(struct direct *, int); static void setblock(struct fs *, unsigned char *, int); static void wtfs(ufs2_daddr_t, int, char *); static u_int32_t newfs_random(void); static int do_sbwrite(struct uufsd *disk) { if (!disk->d_sblock) disk->d_sblock = disk->d_fs.fs_sblockloc / disk->d_bsize; return (pwrite(disk->d_fd, &disk->d_fs, SBLOCKSIZE, (off_t)((part_ofs + disk->d_sblock) * disk->d_bsize))); } void mkfs(struct partition *pp, char *fsys) { int fragsperinode, optimalfpg, origdensity, minfpg, lastminfpg; long i, j, csfrags; uint cg; time_t utime; quad_t sizepb; int width; ino_t maxinum; int minfragsperinode; /* minimum ratio of frags to inodes */ char tmpbuf[100]; /* XXX this will break in about 2,500 years */ union { struct fs fdummy; char cdummy[SBLOCKSIZE]; } dummy; #define fsdummy dummy.fdummy #define chdummy dummy.cdummy /* * Our blocks == sector size, and the version of UFS we are using is * specified by Oflag. */ disk.d_bsize = sectorsize; disk.d_ufs = Oflag; if (Rflag) utime = 1000000000; else time(&utime); sblock.fs_old_flags = FS_FLAGS_UPDATED; sblock.fs_flags = 0; if (Uflag) sblock.fs_flags |= FS_DOSOFTDEP; if (Lflag) strlcpy(sblock.fs_volname, volumelabel, MAXVOLLEN); if (Jflag) sblock.fs_flags |= FS_GJOURNAL; if (lflag) sblock.fs_flags |= FS_MULTILABEL; if (tflag) sblock.fs_flags |= FS_TRIM; /* * Validate the given file system size. * Verify that its last block can actually be accessed. * Convert to file system fragment sized units. 
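 * As a hedged illustration of that conversion: dbtofsb() is a right
 * shift by fs_fsbtodb, i.e. log2(fs_fsize / sectorsize), so with
 * 512-byte sectors and 4 KB fragments, 8192 sectors become
 * 8192 >> 3 = 1024 fragments.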
*/ if (fssize <= 0) { printf("preposterous size %jd\n", (intmax_t)fssize); exit(13); } wtfs(fssize - (realsectorsize / DEV_BSIZE), realsectorsize, (char *)&sblock); /* * collect and verify the file system density info */ sblock.fs_avgfilesize = avgfilesize; sblock.fs_avgfpdir = avgfilesperdir; if (sblock.fs_avgfilesize <= 0) printf("illegal expected average file size %d\n", sblock.fs_avgfilesize), exit(14); if (sblock.fs_avgfpdir <= 0) printf("illegal expected number of files per directory %d\n", sblock.fs_avgfpdir), exit(15); restart: /* * collect and verify the block and fragment sizes */ sblock.fs_bsize = bsize; sblock.fs_fsize = fsize; if (!POWEROF2(sblock.fs_bsize)) { printf("block size must be a power of 2, not %d\n", sblock.fs_bsize); exit(16); } if (!POWEROF2(sblock.fs_fsize)) { printf("fragment size must be a power of 2, not %d\n", sblock.fs_fsize); exit(17); } if (sblock.fs_fsize < sectorsize) { printf("increasing fragment size from %d to sector size (%d)\n", sblock.fs_fsize, sectorsize); sblock.fs_fsize = sectorsize; } if (sblock.fs_bsize > MAXBSIZE) { printf("decreasing block size from %d to maximum (%d)\n", sblock.fs_bsize, MAXBSIZE); sblock.fs_bsize = MAXBSIZE; } if (sblock.fs_bsize < MINBSIZE) { printf("increasing block size from %d to minimum (%d)\n", sblock.fs_bsize, MINBSIZE); sblock.fs_bsize = MINBSIZE; } if (sblock.fs_fsize > MAXBSIZE) { printf("decreasing fragment size from %d to maximum (%d)\n", sblock.fs_fsize, MAXBSIZE); sblock.fs_fsize = MAXBSIZE; } if (sblock.fs_bsize < sblock.fs_fsize) { printf("increasing block size from %d to fragment size (%d)\n", sblock.fs_bsize, sblock.fs_fsize); sblock.fs_bsize = sblock.fs_fsize; } if (sblock.fs_fsize * MAXFRAG < sblock.fs_bsize) { printf( "increasing fragment size from %d to block size / %d (%d)\n", sblock.fs_fsize, MAXFRAG, sblock.fs_bsize / MAXFRAG); sblock.fs_fsize = sblock.fs_bsize / MAXFRAG; } if (maxbsize == 0) maxbsize = bsize; if (maxbsize < bsize || !POWEROF2(maxbsize)) { sblock.fs_maxbsize = sblock.fs_bsize; printf("Extent size set to %d\n", sblock.fs_maxbsize); } else if (sblock.fs_maxbsize > FS_MAXCONTIG * sblock.fs_bsize) { sblock.fs_maxbsize = FS_MAXCONTIG * sblock.fs_bsize; printf("Extent size reduced to %d\n", sblock.fs_maxbsize); } else { sblock.fs_maxbsize = maxbsize; } /* * Maxcontig sets the default for the maximum number of blocks * that may be allocated sequentially. With file system clustering * it is possible to allocate contiguous blocks up to the maximum * transfer size permitted by the controller or buffering. 
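 * As a hedged worked example (assuming the stock 128 KB MAXPHYS and a
 * 32 KB block size), the default computed below is
 *
 *	maxcontig = MAX(1, 131072 / 32768) = 4
 *
 * so up to four blocks may be clustered into a single transfer.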
*/ if (maxcontig == 0) maxcontig = MAX(1, MAXPHYS / bsize); sblock.fs_maxcontig = maxcontig; if (sblock.fs_maxcontig < sblock.fs_maxbsize / sblock.fs_bsize) { sblock.fs_maxcontig = sblock.fs_maxbsize / sblock.fs_bsize; printf("Maxcontig raised to %d\n", sblock.fs_maxbsize); } if (sblock.fs_maxcontig > 1) sblock.fs_contigsumsize = MIN(sblock.fs_maxcontig,FS_MAXCONTIG); sblock.fs_bmask = ~(sblock.fs_bsize - 1); sblock.fs_fmask = ~(sblock.fs_fsize - 1); sblock.fs_qbmask = ~sblock.fs_bmask; sblock.fs_qfmask = ~sblock.fs_fmask; sblock.fs_bshift = ilog2(sblock.fs_bsize); sblock.fs_fshift = ilog2(sblock.fs_fsize); sblock.fs_frag = numfrags(&sblock, sblock.fs_bsize); sblock.fs_fragshift = ilog2(sblock.fs_frag); if (sblock.fs_frag > MAXFRAG) { printf("fragment size %d is still too small (can't happen)\n", sblock.fs_bsize / MAXFRAG); exit(21); } sblock.fs_fsbtodb = ilog2(sblock.fs_fsize / sectorsize); sblock.fs_size = fssize = dbtofsb(&sblock, fssize); sblock.fs_providersize = dbtofsb(&sblock, mediasize / sectorsize); /* * Before the filesystem is finally initialized, mark it * as incompletely initialized. */ sblock.fs_magic = FS_BAD_MAGIC; if (Oflag == 1) { sblock.fs_sblockloc = SBLOCK_UFS1; sblock.fs_nindir = sblock.fs_bsize / sizeof(ufs1_daddr_t); sblock.fs_inopb = sblock.fs_bsize / sizeof(struct ufs1_dinode); - sblock.fs_maxsymlinklen = ((NDADDR + NIADDR) * + sblock.fs_maxsymlinklen = ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)); sblock.fs_old_inodefmt = FS_44INODEFMT; sblock.fs_old_cgoffset = 0; sblock.fs_old_cgmask = 0xffffffff; sblock.fs_old_size = sblock.fs_size; sblock.fs_old_rotdelay = 0; sblock.fs_old_rps = 60; sblock.fs_old_nspf = sblock.fs_fsize / sectorsize; sblock.fs_old_cpg = 1; sblock.fs_old_interleave = 1; sblock.fs_old_trackskew = 0; sblock.fs_old_cpc = 0; sblock.fs_old_postblformat = 1; sblock.fs_old_nrpos = 1; } else { sblock.fs_sblockloc = SBLOCK_UFS2; sblock.fs_nindir = sblock.fs_bsize / sizeof(ufs2_daddr_t); sblock.fs_inopb = sblock.fs_bsize / sizeof(struct ufs2_dinode); - sblock.fs_maxsymlinklen = ((NDADDR + NIADDR) * + sblock.fs_maxsymlinklen = ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)); } sblock.fs_sblkno = roundup(howmany(sblock.fs_sblockloc + SBLOCKSIZE, sblock.fs_fsize), sblock.fs_frag); sblock.fs_cblkno = sblock.fs_sblkno + roundup(howmany(SBLOCKSIZE, sblock.fs_fsize), sblock.fs_frag); sblock.fs_iblkno = sblock.fs_cblkno + sblock.fs_frag; - sblock.fs_maxfilesize = sblock.fs_bsize * NDADDR - 1; - for (sizepb = sblock.fs_bsize, i = 0; i < NIADDR; i++) { + sblock.fs_maxfilesize = sblock.fs_bsize * UFS_NDADDR - 1; + for (sizepb = sblock.fs_bsize, i = 0; i < UFS_NIADDR; i++) { sizepb *= NINDIR(&sblock); sblock.fs_maxfilesize += sizepb; } /* * It's impossible to create a snapshot in case that fs_maxfilesize * is smaller than the fssize. */ if (sblock.fs_maxfilesize < (u_quad_t)fssize) { warnx("WARNING: You will be unable to create snapshots on this " "file system. Correct by using a larger blocksize."); } /* * Calculate the number of blocks to put into each cylinder group. * * This algorithm selects the number of blocks per cylinder * group. The first goal is to have at least enough data blocks * in each cylinder group to meet the density requirement. Once * this goal is achieved we try to expand to have at least * MINCYLGRPS cylinder groups. Once this goal is achieved, we * pack as many blocks into each cylinder group map as will fit. * * We start by calculating the smallest number of blocks that we * can put into each cylinder group. 
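 * (A hedged aside on the inode floor computed first below: maxinum is
 * just under 2^32, so a file system of 2^33 fragments yields
 * minfragsperinode = 1 + 2^33 / 2^32 = 3, i.e. at least three
 * fragments per inode no matter how low a density was requested.)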
If this is too big, we reduce * the density until it fits. */ maxinum = (((int64_t)(1)) << 32) - INOPB(&sblock); minfragsperinode = 1 + fssize / maxinum; if (density == 0) { density = MAX(NFPI, minfragsperinode) * fsize; } else if (density < minfragsperinode * fsize) { origdensity = density; density = minfragsperinode * fsize; fprintf(stderr, "density increased from %d to %d\n", origdensity, density); } origdensity = density; for (;;) { fragsperinode = MAX(numfrags(&sblock, density), 1); if (fragsperinode < minfragsperinode) { bsize <<= 1; fsize <<= 1; printf("Block size too small for a file system %s %d\n", "of this size. Increasing blocksize to", bsize); goto restart; } minfpg = fragsperinode * INOPB(&sblock); if (minfpg > sblock.fs_size) minfpg = sblock.fs_size; sblock.fs_ipg = INOPB(&sblock); sblock.fs_fpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_fpg < minfpg) sblock.fs_fpg = minfpg; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); sblock.fs_fpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_fpg < minfpg) sblock.fs_fpg = minfpg; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); if (CGSIZE(&sblock) < (unsigned long)sblock.fs_bsize) break; density -= sblock.fs_fsize; } if (density != origdensity) printf("density reduced from %d to %d\n", origdensity, density); /* * Start packing more blocks into the cylinder group until * it cannot grow any larger, the number of cylinder groups * drops below MINCYLGRPS, or we reach the size requested. * For UFS1 inodes per cylinder group are stored in an int16_t * so fs_ipg is limited to 2^15 - 1. */ for ( ; sblock.fs_fpg < maxblkspercg; sblock.fs_fpg += sblock.fs_frag) { sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); if (Oflag > 1 || (Oflag == 1 && sblock.fs_ipg <= 0x7fff)) { if (sblock.fs_size / sblock.fs_fpg < MINCYLGRPS) break; if (CGSIZE(&sblock) < (unsigned long)sblock.fs_bsize) continue; if (CGSIZE(&sblock) == (unsigned long)sblock.fs_bsize) break; } sblock.fs_fpg -= sblock.fs_frag; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); break; } /* * Check to be sure that the last cylinder group has enough blocks * to be viable. If it is too small, reduce the number of blocks * per cylinder group which will have the effect of moving more * blocks into the last cylinder group. 
*/ optimalfpg = sblock.fs_fpg; for (;;) { sblock.fs_ncg = howmany(sblock.fs_size, sblock.fs_fpg); lastminfpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_size < lastminfpg) { printf("Filesystem size %jd < minimum size of %d\n", (intmax_t)sblock.fs_size, lastminfpg); exit(28); } if (sblock.fs_size % sblock.fs_fpg >= lastminfpg || sblock.fs_size % sblock.fs_fpg == 0) break; sblock.fs_fpg -= sblock.fs_frag; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); } if (optimalfpg != sblock.fs_fpg) printf("Reduced frags per cylinder group from %d to %d %s\n", optimalfpg, sblock.fs_fpg, "to enlarge last cyl group"); sblock.fs_cgsize = fragroundup(&sblock, CGSIZE(&sblock)); sblock.fs_dblkno = sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock); if (Oflag == 1) { sblock.fs_old_spc = sblock.fs_fpg * sblock.fs_old_nspf; sblock.fs_old_nsect = sblock.fs_old_spc; sblock.fs_old_npsect = sblock.fs_old_spc; sblock.fs_old_ncyl = sblock.fs_ncg; } /* * fill in remaining fields of the super block */ sblock.fs_csaddr = cgdmin(&sblock, 0); sblock.fs_cssize = fragroundup(&sblock, sblock.fs_ncg * sizeof(struct csum)); fscs = (struct csum *)calloc(1, sblock.fs_cssize); if (fscs == NULL) errx(31, "calloc failed"); sblock.fs_sbsize = fragroundup(&sblock, sizeof(struct fs)); if (sblock.fs_sbsize > SBLOCKSIZE) sblock.fs_sbsize = SBLOCKSIZE; sblock.fs_minfree = minfree; if (metaspace > 0 && metaspace < sblock.fs_fpg / 2) sblock.fs_metaspace = blknum(&sblock, metaspace); else if (metaspace != -1) /* reserve half of minfree for metadata blocks */ sblock.fs_metaspace = blknum(&sblock, (sblock.fs_fpg * minfree) / 200); if (maxbpg == 0) sblock.fs_maxbpg = MAXBLKPG(sblock.fs_bsize); else sblock.fs_maxbpg = maxbpg; sblock.fs_optim = opt; sblock.fs_cgrotor = 0; sblock.fs_pendingblocks = 0; sblock.fs_pendinginodes = 0; sblock.fs_fmod = 0; sblock.fs_ronly = 0; sblock.fs_state = 0; sblock.fs_clean = 1; sblock.fs_id[0] = (long)utime; sblock.fs_id[1] = newfs_random(); sblock.fs_fsmnt[0] = '\0'; csfrags = howmany(sblock.fs_cssize, sblock.fs_fsize); sblock.fs_dsize = sblock.fs_size - sblock.fs_sblkno - sblock.fs_ncg * (sblock.fs_dblkno - sblock.fs_sblkno); sblock.fs_cstotal.cs_nbfree = fragstoblks(&sblock, sblock.fs_dsize) - howmany(csfrags, sblock.fs_frag); sblock.fs_cstotal.cs_nffree = fragnum(&sblock, sblock.fs_size) + (fragnum(&sblock, csfrags) > 0 ? sblock.fs_frag - fragnum(&sblock, csfrags) : 0); - sblock.fs_cstotal.cs_nifree = sblock.fs_ncg * sblock.fs_ipg - ROOTINO; + sblock.fs_cstotal.cs_nifree = + sblock.fs_ncg * sblock.fs_ipg - UFS_ROOTINO; sblock.fs_cstotal.cs_ndir = 0; sblock.fs_dsize -= csfrags; sblock.fs_time = utime; if (Oflag == 1) { sblock.fs_old_time = utime; sblock.fs_old_dsize = sblock.fs_dsize; sblock.fs_old_csaddr = sblock.fs_csaddr; sblock.fs_old_cstotal.cs_ndir = sblock.fs_cstotal.cs_ndir; sblock.fs_old_cstotal.cs_nbfree = sblock.fs_cstotal.cs_nbfree; sblock.fs_old_cstotal.cs_nifree = sblock.fs_cstotal.cs_nifree; sblock.fs_old_cstotal.cs_nffree = sblock.fs_cstotal.cs_nffree; } /* * Dump out summary information about file system. 
*/ # define B2MBFACTOR (1 / (1024.0 * 1024.0)) printf("%s: %.1fMB (%jd sectors) block size %d, fragment size %d\n", fsys, (float)sblock.fs_size * sblock.fs_fsize * B2MBFACTOR, (intmax_t)fsbtodb(&sblock, sblock.fs_size), sblock.fs_bsize, sblock.fs_fsize); printf("\tusing %d cylinder groups of %.2fMB, %d blks, %d inodes.\n", sblock.fs_ncg, (float)sblock.fs_fpg * sblock.fs_fsize * B2MBFACTOR, sblock.fs_fpg / sblock.fs_frag, sblock.fs_ipg); if (sblock.fs_flags & FS_DOSOFTDEP) printf("\twith soft updates\n"); # undef B2MBFACTOR if (Eflag && !Nflag) { printf("Erasing sectors [%jd...%jd]\n", sblock.fs_sblockloc / disk.d_bsize, fsbtodb(&sblock, sblock.fs_size) - 1); berase(&disk, sblock.fs_sblockloc / disk.d_bsize, sblock.fs_size * sblock.fs_fsize - sblock.fs_sblockloc); } /* * Wipe out old UFS1 superblock(s) if necessary. */ if (!Nflag && Oflag != 1) { i = bread(&disk, part_ofs + SBLOCK_UFS1 / disk.d_bsize, chdummy, SBLOCKSIZE); if (i == -1) err(1, "can't read old UFS1 superblock: %s", disk.d_error); if (fsdummy.fs_magic == FS_UFS1_MAGIC) { fsdummy.fs_magic = 0; bwrite(&disk, part_ofs + SBLOCK_UFS1 / disk.d_bsize, chdummy, SBLOCKSIZE); for (cg = 0; cg < fsdummy.fs_ncg; cg++) { if (fsbtodb(&fsdummy, cgsblock(&fsdummy, cg)) > fssize) break; bwrite(&disk, part_ofs + fsbtodb(&fsdummy, cgsblock(&fsdummy, cg)), chdummy, SBLOCKSIZE); } } } if (!Nflag) do_sbwrite(&disk); if (Xflag == 1) { printf("** Exiting on Xflag 1\n"); exit(0); } if (Xflag == 2) printf("** Leaving BAD MAGIC on Xflag 2\n"); else sblock.fs_magic = (Oflag != 1) ? FS_UFS2_MAGIC : FS_UFS1_MAGIC; /* * Now build the cylinder group blocks and * then print out indices of cylinder groups. */ printf("super-block backups (for fsck_ffs -b #) at:\n"); i = 0; width = charsperline(); /* * allocate space for superblock, cylinder group map, and * two sets of inode blocks. */ if (sblock.fs_bsize < SBLOCKSIZE) iobufsize = SBLOCKSIZE + 3 * sblock.fs_bsize; else iobufsize = 4 * sblock.fs_bsize; if ((iobuf = calloc(1, iobufsize)) == 0) { printf("Cannot allocate I/O buffer\n"); exit(38); } /* * Make a copy of the superblock into the buffer that we will be * writing out in each cylinder group. */ bcopy((char *)&sblock, iobuf, SBLOCKSIZE); for (cg = 0; cg < sblock.fs_ncg; cg++) { initcg(cg, utime); j = snprintf(tmpbuf, sizeof(tmpbuf), " %jd%s", (intmax_t)fsbtodb(&sblock, cgsblock(&sblock, cg)), cg < (sblock.fs_ncg-1) ? "," : ""); if (j < 0) tmpbuf[j = 0] = '\0'; if (i + j >= width) { printf("\n"); i = 0; } i += j; printf("%s", tmpbuf); fflush(stdout); } printf("\n"); if (Nflag) exit(0); /* * Now construct the initial file system, * then write out the super-block. */ fsinit(utime); if (Oflag == 1) { sblock.fs_old_cstotal.cs_ndir = sblock.fs_cstotal.cs_ndir; sblock.fs_old_cstotal.cs_nbfree = sblock.fs_cstotal.cs_nbfree; sblock.fs_old_cstotal.cs_nifree = sblock.fs_cstotal.cs_nifree; sblock.fs_old_cstotal.cs_nffree = sblock.fs_cstotal.cs_nffree; } if (Xflag == 3) { printf("** Exiting on Xflag 3\n"); exit(0); } if (!Nflag) { do_sbwrite(&disk); /* * For UFS1 filesystems with a blocksize of 64K, the first * alternate superblock resides at the location used for * the default UFS2 superblock. As there is a valid * superblock at this location, the boot code will use * it as its first choice. Thus we have to ensure that * all of its statistics on usage are correct. 
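 * (As a hedged illustration of the collision: SBLOCK_UFS2 is byte
 * offset 65536, exactly one 64 KB block into the file system, which is
 * why the problem arises at this block size only.)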
*/ if (Oflag == 1 && sblock.fs_bsize == 65536) wtfs(fsbtodb(&sblock, cgsblock(&sblock, 0)), sblock.fs_bsize, (char *)&sblock); } for (i = 0; i < sblock.fs_cssize; i += sblock.fs_bsize) wtfs(fsbtodb(&sblock, sblock.fs_csaddr + numfrags(&sblock, i)), MIN(sblock.fs_cssize - i, sblock.fs_bsize), ((char *)fscs) + i); /* * Update information about this partition in pack * label, so that it may be updated on disk. */ if (pp != NULL) { pp->p_fstype = FS_BSDFFS; pp->p_fsize = sblock.fs_fsize; pp->p_frag = sblock.fs_frag; pp->p_cpg = sblock.fs_fpg; } } /* * Initialize a cylinder group. */ void initcg(int cylno, time_t utime) { long blkno, start; uint i, j, d, dlower, dupper; ufs2_daddr_t cbase, dmax; struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; struct csum *cs; /* * Determine block bounds for cylinder group. * Allow space for super block summary information in first * cylinder group. */ cbase = cgbase(&sblock, cylno); dmax = cbase + sblock.fs_fpg; if (dmax > sblock.fs_size) dmax = sblock.fs_size; dlower = cgsblock(&sblock, cylno) - cbase; dupper = cgdmin(&sblock, cylno) - cbase; if (cylno == 0) dupper += howmany(sblock.fs_cssize, sblock.fs_fsize); cs = &fscs[cylno]; memset(&acg, 0, sblock.fs_cgsize); acg.cg_time = utime; acg.cg_magic = CG_MAGIC; acg.cg_cgx = cylno; acg.cg_niblk = sblock.fs_ipg; acg.cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock)); acg.cg_ndblk = dmax - cbase; if (sblock.fs_contigsumsize > 0) acg.cg_nclusterblks = acg.cg_ndblk / sblock.fs_frag; start = &acg.cg_space[0] - (u_char *)(&acg.cg_firstfield); if (Oflag == 2) { acg.cg_iusedoff = start; } else { acg.cg_old_ncyl = sblock.fs_old_cpg; acg.cg_old_time = acg.cg_time; acg.cg_time = 0; acg.cg_old_niblk = acg.cg_niblk; acg.cg_niblk = 0; acg.cg_initediblk = 0; acg.cg_old_btotoff = start; acg.cg_old_boff = acg.cg_old_btotoff + sblock.fs_old_cpg * sizeof(int32_t); acg.cg_iusedoff = acg.cg_old_boff + sblock.fs_old_cpg * sizeof(u_int16_t); } acg.cg_freeoff = acg.cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT); acg.cg_nextfreeoff = acg.cg_freeoff + howmany(sblock.fs_fpg, CHAR_BIT); if (sblock.fs_contigsumsize > 0) { acg.cg_clustersumoff = roundup(acg.cg_nextfreeoff, sizeof(u_int32_t)); acg.cg_clustersumoff -= sizeof(u_int32_t); acg.cg_clusteroff = acg.cg_clustersumoff + (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t); acg.cg_nextfreeoff = acg.cg_clusteroff + howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT); } if (acg.cg_nextfreeoff > (unsigned)sblock.fs_cgsize) { printf("Panic: cylinder group too big\n"); exit(37); } acg.cg_cs.cs_nifree += sblock.fs_ipg; if (cylno == 0) - for (i = 0; i < (long)ROOTINO; i++) { + for (i = 0; i < (long)UFS_ROOTINO; i++) { setbit(cg_inosused(&acg), i); acg.cg_cs.cs_nifree--; } if (cylno > 0) { /* * In cylno 0, beginning space is reserved * for boot and super blocks. 
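 * As a hedged illustration: dlower = cgsblock() - cgbase() counts the
 * fragments in front of this group's superblock backup; in every group
 * but 0 they carry no boot data, so the loop below frees them a whole
 * block at a time.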
*/ for (d = 0; d < dlower; d += sblock.fs_frag) { blkno = d / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree++; } } if ((i = dupper % sblock.fs_frag)) { acg.cg_frsum[sblock.fs_frag - i]++; for (d = dupper + sblock.fs_frag - i; dupper < d; dupper++) { setbit(cg_blksfree(&acg), dupper); acg.cg_cs.cs_nffree++; } } for (d = dupper; d + sblock.fs_frag <= acg.cg_ndblk; d += sblock.fs_frag) { blkno = d / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree++; } if (d < acg.cg_ndblk) { acg.cg_frsum[acg.cg_ndblk - d]++; for (; d < acg.cg_ndblk; d++) { setbit(cg_blksfree(&acg), d); acg.cg_cs.cs_nffree++; } } if (sblock.fs_contigsumsize > 0) { int32_t *sump = cg_clustersum(&acg); u_char *mapp = cg_clustersfree(&acg); int map = *mapp++; int bit = 1; int run = 0; for (i = 0; i < acg.cg_nclusterblks; i++) { if ((map & bit) != 0) run++; else if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; run = 0; } if ((i & (CHAR_BIT - 1)) != CHAR_BIT - 1) bit <<= 1; else { map = *mapp++; bit = 1; } } if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; } } *cs = acg.cg_cs; /* * Write out the duplicate super block, the cylinder group map * and two blocks worth of inodes in a single write. */ start = MAX(sblock.fs_bsize, SBLOCKSIZE); bcopy((char *)&acg, &iobuf[start], sblock.fs_cgsize); start += sblock.fs_bsize; dp1 = (struct ufs1_dinode *)(&iobuf[start]); dp2 = (struct ufs2_dinode *)(&iobuf[start]); for (i = 0; i < acg.cg_initediblk; i++) { if (sblock.fs_magic == FS_UFS1_MAGIC) { dp1->di_gen = newfs_random(); dp1++; } else { dp2->di_gen = newfs_random(); dp2++; } } wtfs(fsbtodb(&sblock, cgsblock(&sblock, cylno)), iobufsize, iobuf); /* * For the old file system, we have to initialize all the inodes. */ if (Oflag == 1) { for (i = 2 * sblock.fs_frag; i < sblock.fs_ipg / INOPF(&sblock); i += sblock.fs_frag) { dp1 = (struct ufs1_dinode *)(&iobuf[start]); for (j = 0; j < INOPB(&sblock); j++) { dp1->di_gen = newfs_random(); dp1++; } wtfs(fsbtodb(&sblock, cgimin(&sblock, cylno) + i), sblock.fs_bsize, &iobuf[start]); } } } /* * initialize the file system */ #define ROOTLINKCNT 3 static struct direct root_dir[] = { - { ROOTINO, sizeof(struct direct), DT_DIR, 1, "." }, - { ROOTINO, sizeof(struct direct), DT_DIR, 2, ".." }, - { ROOTINO + 1, sizeof(struct direct), DT_DIR, 5, ".snap" }, + { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 1, "." }, + { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 2, ".." }, + { UFS_ROOTINO + 1, sizeof(struct direct), DT_DIR, 5, ".snap" }, }; #define SNAPLINKCNT 2 static struct direct snap_dir[] = { - { ROOTINO + 1, sizeof(struct direct), DT_DIR, 1, "." }, - { ROOTINO, sizeof(struct direct), DT_DIR, 2, ".." }, + { UFS_ROOTINO + 1, sizeof(struct direct), DT_DIR, 1, "." }, + { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 2, ".." }, }; void fsinit(time_t utime) { union dinode node; struct group *grp; gid_t gid; int entries; memset(&node, 0, sizeof node); if ((grp = getgrnam("operator")) != NULL) { gid = grp->gr_gid; } else { warnx("Cannot retrieve operator gid, using gid 0."); gid = 0; } entries = (nflag) ? 
ROOTLINKCNT - 1: ROOTLINKCNT; if (sblock.fs_magic == FS_UFS1_MAGIC) { /* * initialize the node */ node.dp1.di_atime = utime; node.dp1.di_mtime = utime; node.dp1.di_ctime = utime; /* * create the root directory */ node.dp1.di_mode = IFDIR | UMASK; node.dp1.di_nlink = entries; node.dp1.di_size = makedir(root_dir, entries); node.dp1.di_db[0] = alloc(sblock.fs_fsize, node.dp1.di_mode); node.dp1.di_blocks = btodb(fragroundup(&sblock, node.dp1.di_size)); wtfs(fsbtodb(&sblock, node.dp1.di_db[0]), sblock.fs_fsize, iobuf); - iput(&node, ROOTINO); + iput(&node, UFS_ROOTINO); if (!nflag) { /* * create the .snap directory */ node.dp1.di_mode |= 020; node.dp1.di_gid = gid; node.dp1.di_nlink = SNAPLINKCNT; node.dp1.di_size = makedir(snap_dir, SNAPLINKCNT); node.dp1.di_db[0] = alloc(sblock.fs_fsize, node.dp1.di_mode); node.dp1.di_blocks = btodb(fragroundup(&sblock, node.dp1.di_size)); wtfs(fsbtodb(&sblock, node.dp1.di_db[0]), sblock.fs_fsize, iobuf); - iput(&node, ROOTINO + 1); + iput(&node, UFS_ROOTINO + 1); } } else { /* * initialize the node */ node.dp2.di_atime = utime; node.dp2.di_mtime = utime; node.dp2.di_ctime = utime; node.dp2.di_birthtime = utime; /* * create the root directory */ node.dp2.di_mode = IFDIR | UMASK; node.dp2.di_nlink = entries; node.dp2.di_size = makedir(root_dir, entries); node.dp2.di_db[0] = alloc(sblock.fs_fsize, node.dp2.di_mode); node.dp2.di_blocks = btodb(fragroundup(&sblock, node.dp2.di_size)); wtfs(fsbtodb(&sblock, node.dp2.di_db[0]), sblock.fs_fsize, iobuf); - iput(&node, ROOTINO); + iput(&node, UFS_ROOTINO); if (!nflag) { /* * create the .snap directory */ node.dp2.di_mode |= 020; node.dp2.di_gid = gid; node.dp2.di_nlink = SNAPLINKCNT; node.dp2.di_size = makedir(snap_dir, SNAPLINKCNT); node.dp2.di_db[0] = alloc(sblock.fs_fsize, node.dp2.di_mode); node.dp2.di_blocks = btodb(fragroundup(&sblock, node.dp2.di_size)); wtfs(fsbtodb(&sblock, node.dp2.di_db[0]), sblock.fs_fsize, iobuf); - iput(&node, ROOTINO + 1); + iput(&node, UFS_ROOTINO + 1); } } } /* * construct a set of directory entries in "iobuf". * return size of directory. 
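 */

/*
 * A minimal sketch, not part of this change, of the packing rule
 * makedir() applies; the helper name is illustrative only. Every entry
 * gets its DIRSIZ() footprint as d_reclen, except the last one, which
 * is stretched to absorb the rest of the DIRBLKSIZ directory block.
 */
static int
reclen_sketch(struct direct *dp, int islast, int spcleft)
{
	return (islast ? spcleft : DIRSIZ(0, dp));
}

/*
 * makedir() follows.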
*/ int makedir(struct direct *protodir, int entries) { char *cp; int i, spcleft; spcleft = DIRBLKSIZ; memset(iobuf, 0, DIRBLKSIZ); for (cp = iobuf, i = 0; i < entries - 1; i++) { protodir[i].d_reclen = DIRSIZ(0, &protodir[i]); memmove(cp, &protodir[i], protodir[i].d_reclen); cp += protodir[i].d_reclen; spcleft -= protodir[i].d_reclen; } protodir[i].d_reclen = spcleft; memmove(cp, &protodir[i], DIRSIZ(0, &protodir[i])); return (DIRBLKSIZ); } /* * allocate a block or frag */ ufs2_daddr_t alloc(int size, int mode) { int i, blkno, frag; uint d; bread(&disk, part_ofs + fsbtodb(&sblock, cgtod(&sblock, 0)), (char *)&acg, sblock.fs_cgsize); if (acg.cg_magic != CG_MAGIC) { printf("cg 0: bad magic number\n"); exit(38); } if (acg.cg_cs.cs_nbfree == 0) { printf("first cylinder group ran out of space\n"); exit(39); } for (d = 0; d < acg.cg_ndblk; d += sblock.fs_frag) if (isblock(&sblock, cg_blksfree(&acg), d / sblock.fs_frag)) goto goth; printf("internal error: can't find block in cyl 0\n"); exit(40); goth: blkno = fragstoblks(&sblock, d); clrblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) clrbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree--; sblock.fs_cstotal.cs_nbfree--; fscs[0].cs_nbfree--; if (mode & IFDIR) { acg.cg_cs.cs_ndir++; sblock.fs_cstotal.cs_ndir++; fscs[0].cs_ndir++; } if (size != sblock.fs_bsize) { frag = howmany(size, sblock.fs_fsize); fscs[0].cs_nffree += sblock.fs_frag - frag; sblock.fs_cstotal.cs_nffree += sblock.fs_frag - frag; acg.cg_cs.cs_nffree += sblock.fs_frag - frag; acg.cg_frsum[sblock.fs_frag - frag]++; for (i = frag; i < sblock.fs_frag; i++) setbit(cg_blksfree(&acg), d + i); } /* XXX cgwrite(&disk, 0)??? */ wtfs(fsbtodb(&sblock, cgtod(&sblock, 0)), sblock.fs_cgsize, (char *)&acg); return ((ufs2_daddr_t)d); } /* * Allocate an inode on the disk */ void iput(union dinode *ip, ino_t ino) { ufs2_daddr_t d; bread(&disk, part_ofs + fsbtodb(&sblock, cgtod(&sblock, 0)), (char *)&acg, sblock.fs_cgsize); if (acg.cg_magic != CG_MAGIC) { printf("cg 0: bad magic number\n"); exit(31); } acg.cg_cs.cs_nifree--; setbit(cg_inosused(&acg), ino); wtfs(fsbtodb(&sblock, cgtod(&sblock, 0)), sblock.fs_cgsize, (char *)&acg); sblock.fs_cstotal.cs_nifree--; fscs[0].cs_nifree--; if (ino >= (unsigned long)sblock.fs_ipg * sblock.fs_ncg) { printf("fsinit: inode value out of range (%ju).\n", (uintmax_t)ino); exit(32); } d = fsbtodb(&sblock, ino_to_fsba(&sblock, ino)); bread(&disk, part_ofs + d, (char *)iobuf, sblock.fs_bsize); if (sblock.fs_magic == FS_UFS1_MAGIC) ((struct ufs1_dinode *)iobuf)[ino_to_fsbo(&sblock, ino)] = ip->dp1; else ((struct ufs2_dinode *)iobuf)[ino_to_fsbo(&sblock, ino)] = ip->dp2; wtfs(d, sblock.fs_bsize, (char *)iobuf); } /* * possibly write to disk */ static void wtfs(ufs2_daddr_t bno, int size, char *bf) { if (Nflag) return; if (bwrite(&disk, part_ofs + bno, bf, size) < 0) err(36, "wtfs: %d bytes at sector %jd", size, (intmax_t)bno); } /* * check if a block is available */ static int isblock(struct fs *fs, unsigned char *cp, int h) { unsigned char mask; switch (fs->fs_frag) { case 8: return (cp[h] == 0xff); case 4: mask = 0x0f << ((h & 0x1) << 2); return ((cp[h >> 1] & mask) == mask); case 2: mask = 0x03 << ((h & 0x3) << 1); return ((cp[h >> 2] & mask) == mask); case 1: mask = 0x01 << (h & 0x7); return ((cp[h >> 3] & mask) == mask); default: fprintf(stderr, "isblock bad fs_frag %d\n", fs->fs_frag); return (0); } } /* * take a block out of the map */ static void clrblock(struct fs *fs, unsigned char *cp, int h) { switch ((fs)->fs_frag) { case 8: 
cp[h] = 0; return; case 4: cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] &= ~(0x01 << (h & 0x7)); return; default: fprintf(stderr, "clrblock bad fs_frag %d\n", fs->fs_frag); return; } } /* * put a block into the map */ static void setblock(struct fs *fs, unsigned char *cp, int h) { switch (fs->fs_frag) { case 8: cp[h] = 0xff; return; case 4: cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] |= (0x01 << (h & 0x7)); return; default: fprintf(stderr, "setblock bad fs_frag %d\n", fs->fs_frag); return; } } /* * Determine the number of characters in a * single line. */ static int charsperline(void) { int columns; char *cp; struct winsize ws; columns = 0; if (ioctl(0, TIOCGWINSZ, &ws) != -1) columns = ws.ws_col; if (columns == 0 && (cp = getenv("COLUMNS"))) columns = atoi(cp); if (columns == 0) columns = 80; /* last resort */ return (columns); } static int ilog2(int val) { u_int n; for (n = 0; n < sizeof(n) * CHAR_BIT; n++) if (1 << n == val) return (n); errx(1, "ilog2: %d is not a power of 2\n", val); } /* * For the regression test, return predictable random values. * Otherwise use a true random number generator. */ static u_int32_t newfs_random(void) { static int nextnum = 1; if (Rflag) return (nextnum++); return (arc4random()); } Index: head/sbin/newfs_nandfs/newfs_nandfs.c =================================================================== --- head/sbin/newfs_nandfs/newfs_nandfs.c (revision 313779) +++ head/sbin/newfs_nandfs/newfs_nandfs.c (revision 313780) @@ -1,1179 +1,1179 @@ /*- * Copyright (c) 2010-2012 Semihalf. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG #undef DEBUG #ifdef DEBUG #define debug(fmt, args...) do { \ printf("nandfs:" fmt "\n", ##args); } while (0) #else #define debug(fmt, args...) 
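/*
 * Note that in non-DEBUG builds the empty definition above makes debug()
 * disappear at preprocessing time, so its arguments are never evaluated;
 * call sites must not rely on side effects within debug() arguments.
 */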
#endif #define NANDFS_FIRST_BLOCK nandfs_first_block() #define NANDFS_FIRST_CNO 1 #define NANDFS_BLOCK_BAD 1 #define NANDFS_BLOCK_GOOD 0 struct file_info { uint64_t ino; const char *name; uint32_t mode; uint64_t size; uint8_t nblocks; uint32_t *blocks; struct nandfs_inode *inode; }; static struct file_info user_files[] = { { NANDFS_ROOT_INO, NULL, S_IFDIR | 0755, 0, 1, NULL, NULL }, }; static struct file_info ifile = { NANDFS_IFILE_INO, NULL, 0, 0, -1, NULL, NULL }; static struct file_info sufile = { NANDFS_SUFILE_INO, NULL, 0, 0, -1, NULL, NULL }; static struct file_info cpfile = { NANDFS_CPFILE_INO, NULL, 0, 0, -1, NULL, NULL }; static struct file_info datfile = { NANDFS_DAT_INO, NULL, 0, 0, -1, NULL, NULL }; struct nandfs_block { LIST_ENTRY(nandfs_block) block_link; uint32_t number; uint64_t offset; void *data; }; static LIST_HEAD(, nandfs_block) block_head = LIST_HEAD_INITIALIZER(&block_head); /* Storage geometry */ static off_t mediasize; static ssize_t sectorsize; static uint64_t nsegments; static uint64_t erasesize; static uint64_t segsize; static struct nandfs_fsdata fsdata; static struct nandfs_super_block super_block; static int is_nand; /* Nandfs parameters */ static size_t blocksize = NANDFS_DEF_BLOCKSIZE; static long blocks_per_segment; static long rsv_segment_percent = 5; static time_t nandfs_time; static uint32_t bad_segments_count = 0; static uint32_t *bad_segments = NULL; static uint8_t fsdata_blocks_state[NANDFS_NFSAREAS]; static u_char *volumelabel = NULL; static struct nandfs_super_root *sr; static uint32_t nuserfiles; static uint32_t seg_nblocks; static uint32_t seg_endblock; #define SIZE_TO_BLOCK(size) howmany(size, blocksize) static uint32_t nandfs_first_block(void) { uint32_t i, first_free, start_bad_segments = 0; for (i = 0; i < bad_segments_count; i++) { if (i == bad_segments[i]) start_bad_segments++; else break; } first_free = SIZE_TO_BLOCK(NANDFS_DATA_OFFSET_BYTES(erasesize) + (start_bad_segments * segsize)); if (first_free < (uint32_t)blocks_per_segment) return (blocks_per_segment); else return (first_free); } static void usage(void) { fprintf(stderr, "usage: newfs_nandfs [ -options ] device\n" "where the options are:\n" "\t-b block-size\n" "\t-B blocks-per-segment\n" "\t-L volume label\n" "\t-m reserved-segments-percentage\n"); exit(1); } static int nandfs_log2(unsigned n) { unsigned count; /* * N.B. this function will return 0 if supplied 0. 
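 * For a power-of-two argument it is equivalent to ffs(n) - 1;
 * create_fsdata() later relies on this and derives f_log_block_size as
 * nandfs_log2(blocksize) - 10, i.e. the block-size shift expressed
 * relative to a 1024-byte base.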
*/ for (count = 0; n/2; count++) n /= 2; return count; } /* from NetBSD's src/sys/net/if_ethersubr.c */ static uint32_t crc32_le(uint32_t crc, const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; crc = crc ^ ~0U; for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc ^ ~0U); } static void * get_block(uint32_t block_nr, uint64_t offset) { struct nandfs_block *block, *new_block; LIST_FOREACH(block, &block_head, block_link) { if (block->number == block_nr) return block->data; } debug("allocating block %x\n", block_nr); new_block = malloc(sizeof(*block)); if (!new_block) err(1, "cannot allocate block"); new_block->number = block_nr; new_block->offset = offset; new_block->data = malloc(blocksize); if (!new_block->data) err(1, "cannot allocate block data"); memset(new_block->data, 0, blocksize); LIST_INSERT_HEAD(&block_head, new_block, block_link); return (new_block->data); } static int nandfs_seg_usage_blk_offset(uint64_t seg, uint64_t *blk, uint64_t *offset) { uint64_t off; uint16_t seg_size; seg_size = sizeof(struct nandfs_segment_usage); off = roundup(sizeof(struct nandfs_sufile_header), seg_size); off += (seg * seg_size); *blk = off / blocksize; *offset = (off % blocksize) / seg_size; return (0); } static uint32_t segment_size(void) { u_int size; size = sizeof(struct nandfs_segment_summary); size += seg_nblocks * sizeof(struct nandfs_binfo_v); if (size > blocksize) err(1, "segsum info bigger than blocksize"); return (size); } static void prepare_blockgrouped_file(uint32_t block) { struct nandfs_block_group_desc *desc; uint32_t i, entries; desc = (struct nandfs_block_group_desc *)get_block(block, 0); entries = blocksize / sizeof(struct nandfs_block_group_desc); for (i = 0; i < entries; i++) desc[i].bg_nfrees = blocksize * 8; } static void alloc_blockgrouped_file(uint32_t block, uint32_t entry) { struct nandfs_block_group_desc *desc; uint32_t desc_nr; uint32_t *bitmap; desc = (struct nandfs_block_group_desc *)get_block(block, 0); bitmap = (uint32_t *)get_block(block + 1, 1); bitmap += (entry >> 5); if (*bitmap & (1 << (entry % 32))) { printf("nandfs: blockgrouped entry %d already allocated\n", entry); } *bitmap |= (1 << (entry % 32)); desc_nr = entry / (blocksize * 8); desc[desc_nr].bg_nfrees--; } static uint64_t count_su_blocks(void) { uint64_t maxblk, blk, offset, i; maxblk = blk = 0; for (i = 0; i < bad_segments_count; i++) { nandfs_seg_usage_blk_offset(bad_segments[i], &blk, &offset); debug("bad segment at block:%jx off: %jx", blk, offset); if (blk > maxblk) maxblk = blk; } debug("bad segment needs %#jx", blk); - if (blk >= NDADDR) { - printf("nandfs: file too big (%jd > %d)\n", blk, NDADDR); + if (blk >= NANDFS_NDADDR) { + printf("nandfs: file too big (%jd > %d)\n", blk, NANDFS_NDADDR); exit(2); } sufile.size = (blk + 1) * blocksize; return (blk + 1); } static void count_seg_blocks(void) { uint32_t i; for (i = 0; i < nuserfiles; i++) if (user_files[i].nblocks) { seg_nblocks += user_files[i].nblocks; user_files[i].blocks = malloc(user_files[i].nblocks * sizeof(uint32_t)); } ifile.nblocks = 2 + SIZE_TO_BLOCK(sizeof(struct nandfs_inode) * (NANDFS_USER_INO + 1)); ifile.blocks = malloc(ifile.nblocks * sizeof(uint32_t)); seg_nblocks += ifile.nblocks; cpfile.nblocks =
SIZE_TO_BLOCK((NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET + 1) * sizeof(struct nandfs_checkpoint)); cpfile.blocks = malloc(cpfile.nblocks * sizeof(uint32_t)); seg_nblocks += cpfile.nblocks; if (!bad_segments) { sufile.nblocks = SIZE_TO_BLOCK((NANDFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET + 1) * sizeof(struct nandfs_segment_usage)); } else { debug("bad blocks found: extra space for sufile"); sufile.nblocks = count_su_blocks(); } sufile.blocks = malloc(sufile.nblocks * sizeof(uint32_t)); seg_nblocks += sufile.nblocks; datfile.nblocks = 2 + SIZE_TO_BLOCK((seg_nblocks) * sizeof(struct nandfs_dat_entry)); datfile.blocks = malloc(datfile.nblocks * sizeof(uint32_t)); seg_nblocks += datfile.nblocks; } static void assign_file_blocks(uint64_t start_block) { uint32_t i, j; for (i = 0; i < nuserfiles; i++) for (j = 0; j < user_files[i].nblocks; j++) { debug("user file %d at block %d at %#jx", i, j, (uintmax_t)start_block); user_files[i].blocks[j] = start_block++; } for (j = 0; j < ifile.nblocks; j++) { debug("ifile block %d at %#jx", j, (uintmax_t)start_block); ifile.blocks[j] = start_block++; } for (j = 0; j < cpfile.nblocks; j++) { debug("cpfile block %d at %#jx", j, (uintmax_t)start_block); cpfile.blocks[j] = start_block++; } for (j = 0; j < sufile.nblocks; j++) { debug("sufile block %d at %#jx", j, (uintmax_t)start_block); sufile.blocks[j] = start_block++; } for (j = 0; j < datfile.nblocks; j++) { debug("datfile block %d at %#jx", j, (uintmax_t)start_block); datfile.blocks[j] = start_block++; } /* add one for superroot */ debug("sr at block %#jx", (uintmax_t)start_block); sr = (struct nandfs_super_root *)get_block(start_block++, 0); seg_endblock = start_block; } static void save_datfile(void) { prepare_blockgrouped_file(datfile.blocks[0]); } static uint64_t update_datfile(uint64_t block) { struct nandfs_dat_entry *dat; static uint64_t vblock = 0; uint64_t allocated, i, off; if (vblock == 0) { alloc_blockgrouped_file(datfile.blocks[0], vblock); vblock++; } allocated = vblock; i = vblock / (blocksize / sizeof(*dat)); off = vblock % (blocksize / sizeof(*dat)); vblock++; dat = (struct nandfs_dat_entry *)get_block(datfile.blocks[2 + i], 2 + i); alloc_blockgrouped_file(datfile.blocks[0], allocated); dat[off].de_blocknr = block; dat[off].de_start = NANDFS_FIRST_CNO; dat[off].de_end = UINTMAX_MAX; return (allocated); } static union nandfs_binfo * update_block_info(union nandfs_binfo *binfo, struct file_info *file) { nandfs_daddr_t vblock; uint32_t i; for (i = 0; i < file->nblocks; i++) { debug("%s: blk %x", __func__, i); if (file->ino != NANDFS_DAT_INO) { vblock = update_datfile(file->blocks[i]); binfo->bi_v.bi_vblocknr = vblock; binfo->bi_v.bi_blkoff = i; binfo->bi_v.bi_ino = file->ino; file->inode->i_db[i] = vblock; } else { binfo->bi_dat.bi_blkoff = i; binfo->bi_dat.bi_ino = file->ino; file->inode->i_db[i] = datfile.blocks[i]; } binfo++; } return (binfo); } static void save_segsum(struct nandfs_segment_summary *ss) { union nandfs_binfo *binfo; struct nandfs_block *block; uint32_t sum_bytes, i; uint8_t crc_data, crc_skip; sum_bytes = segment_size(); ss->ss_magic = NANDFS_SEGSUM_MAGIC; ss->ss_bytes = sizeof(struct nandfs_segment_summary); ss->ss_flags = NANDFS_SS_LOGBGN | NANDFS_SS_LOGEND | NANDFS_SS_SR; ss->ss_seq = 1; ss->ss_create = nandfs_time; ss->ss_next = nandfs_first_block() + blocks_per_segment; /* nblocks = segment blocks + segsum block + superroot */ ss->ss_nblocks = seg_nblocks + 2; ss->ss_nbinfos = seg_nblocks; ss->ss_sumbytes = sum_bytes; crc_skip = sizeof(ss->ss_datasum) + sizeof(ss->ss_sumsum); 
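	/*
	 * Checksum layout: ss_sumsum covers the summary area starting just
	 * past the two checksum fields themselves, while ss_datasum (filled
	 * in at the bottom of this function) is seeded from the first
	 * payload block with only the ss_datasum bytes skipped and is then
	 * chained across every remaining block of the segment.
	 */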
ss->ss_sumsum = crc32_le(0, (uint8_t *)ss + crc_skip, sum_bytes - crc_skip); crc_data = 0; binfo = (union nandfs_binfo *)(ss + 1); for (i = 0; i < nuserfiles; i++) { if (user_files[i].nblocks) binfo = update_block_info(binfo, &user_files[i]); } binfo = update_block_info(binfo, &ifile); binfo = update_block_info(binfo, &cpfile); binfo = update_block_info(binfo, &sufile); update_block_info(binfo, &datfile); /* save superroot crc */ crc_skip = sizeof(sr->sr_sum); sr->sr_sum = crc32_le(0, (uint8_t *)sr + crc_skip, NANDFS_SR_BYTES - crc_skip); /* segment checksum */ crc_skip = sizeof(ss->ss_datasum); LIST_FOREACH(block, &block_head, block_link) { if (block->number < NANDFS_FIRST_BLOCK) continue; if (block->number == NANDFS_FIRST_BLOCK) crc_data = crc32_le(0, (uint8_t *)block->data + crc_skip, blocksize - crc_skip); else crc_data = crc32_le(crc_data, (uint8_t *)block->data, blocksize); } ss->ss_datasum = crc_data; } static void create_fsdata(void) { memset(&fsdata, 0, sizeof(struct nandfs_fsdata)); fsdata.f_magic = NANDFS_FSDATA_MAGIC; fsdata.f_nsegments = nsegments; fsdata.f_erasesize = erasesize; fsdata.f_first_data_block = NANDFS_FIRST_BLOCK; fsdata.f_blocks_per_segment = blocks_per_segment; fsdata.f_r_segments_percentage = rsv_segment_percent; fsdata.f_rev_level = NANDFS_CURRENT_REV; fsdata.f_sbbytes = NANDFS_SB_BYTES; fsdata.f_bytes = NANDFS_FSDATA_CRC_BYTES; fsdata.f_ctime = nandfs_time; fsdata.f_log_block_size = nandfs_log2(blocksize) - 10; fsdata.f_errors = 1; fsdata.f_inode_size = sizeof(struct nandfs_inode); fsdata.f_dat_entry_size = sizeof(struct nandfs_dat_entry); fsdata.f_checkpoint_size = sizeof(struct nandfs_checkpoint); fsdata.f_segment_usage_size = sizeof(struct nandfs_segment_usage); uuidgen(&fsdata.f_uuid, 1); if (volumelabel) memcpy(fsdata.f_volume_name, volumelabel, 16); fsdata.f_sum = crc32_le(0, (const uint8_t *)&fsdata, NANDFS_FSDATA_CRC_BYTES); } static void save_fsdata(void *data) { memcpy(data, &fsdata, sizeof(fsdata)); } static void create_super_block(void) { memset(&super_block, 0, sizeof(struct nandfs_super_block)); super_block.s_magic = NANDFS_SUPER_MAGIC; super_block.s_last_cno = NANDFS_FIRST_CNO; super_block.s_last_pseg = NANDFS_FIRST_BLOCK; super_block.s_last_seq = 1; super_block.s_free_blocks_count = (nsegments - bad_segments_count) * blocks_per_segment; super_block.s_mtime = 0; super_block.s_wtime = nandfs_time; super_block.s_state = NANDFS_VALID_FS; super_block.s_sum = crc32_le(0, (const uint8_t *)&super_block, NANDFS_SB_BYTES); } static void save_super_block(void *data) { memcpy(data, &super_block, sizeof(super_block)); } static void save_super_root(void) { sr->sr_bytes = NANDFS_SR_BYTES; sr->sr_flags = 0; sr->sr_nongc_ctime = nandfs_time; datfile.inode = &sr->sr_dat; cpfile.inode = &sr->sr_cpfile; sufile.inode = &sr->sr_sufile; } static struct nandfs_dir_entry * add_de(void *block, struct nandfs_dir_entry *de, uint64_t ino, const char *name, uint8_t type) { uint16_t reclen; /* modify last de */ de->rec_len = NANDFS_DIR_REC_LEN(de->name_len); de = (void *)((uint8_t *)de + de->rec_len); reclen = blocksize - ((uintptr_t)de - (uintptr_t)block); if (reclen < NANDFS_DIR_REC_LEN(strlen(name))) { printf("nandfs: too many dir entries for one block\n"); return (NULL); } de->inode = ino; de->rec_len = reclen; de->name_len = strlen(name); de->file_type = type; memset(de->name, 0, (strlen(name) + NANDFS_DIR_PAD - 1) & ~NANDFS_DIR_ROUND); memcpy(de->name, name, strlen(name)); return (de); } static struct nandfs_dir_entry * make_dir(void *block, uint64_t ino, uint64_t
parent_ino) { struct nandfs_dir_entry *de = (struct nandfs_dir_entry *)block; /* create '..' entry */ de->inode = parent_ino; de->rec_len = NANDFS_DIR_REC_LEN(2); de->name_len = 2; de->file_type = DT_DIR; memset(de->name, 0, NANDFS_DIR_NAME_LEN(2)); memcpy(de->name, "..", 2); /* create '.' entry */ de = (void *)((uint8_t *)block + NANDFS_DIR_REC_LEN(2)); de->inode = ino; de->rec_len = blocksize - NANDFS_DIR_REC_LEN(2); de->name_len = 1; de->file_type = DT_DIR; memset(de->name, 0, NANDFS_DIR_NAME_LEN(1)); memcpy(de->name, ".", 1); return (de); } static void save_root_dir(void) { struct file_info *root = &user_files[0]; struct nandfs_dir_entry *de; uint32_t i; void *block; block = get_block(root->blocks[0], 0); de = make_dir(block, root->ino, root->ino); for (i = 1; i < nuserfiles; i++) de = add_de(block, de, user_files[i].ino, user_files[i].name, IFTODT(user_files[i].mode)); root->size = ((uintptr_t)de - (uintptr_t)block) + NANDFS_DIR_REC_LEN(de->name_len); } static void save_sufile(void) { struct nandfs_sufile_header *header; struct nandfs_segment_usage *su; uint64_t blk, i, off; void *block; int start; /* * At the beginning just zero-out everything */ for (i = 0; i < sufile.nblocks; i++) get_block(sufile.blocks[i], 0); start = 0; block = get_block(sufile.blocks[start], 0); header = (struct nandfs_sufile_header *)block; header->sh_ncleansegs = nsegments - bad_segments_count - 1; header->sh_ndirtysegs = 1; header->sh_last_alloc = 1; su = (struct nandfs_segment_usage *)header; off = NANDFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET; /* Allocate data segment */ su[off].su_lastmod = nandfs_time; /* nblocks = segment blocks + segsum block + superroot */ su[off].su_nblocks = seg_nblocks + 2; su[off].su_flags = NANDFS_SEGMENT_USAGE_DIRTY; off++; /* Allocate next segment */ su[off].su_lastmod = nandfs_time; su[off].su_nblocks = 0; su[off].su_flags = NANDFS_SEGMENT_USAGE_DIRTY; for (i = 0; i < bad_segments_count; i++) { nandfs_seg_usage_blk_offset(bad_segments[i], &blk, &off); debug("storing bad_segments[%jd]=%x at %jx off %jx\n", i, bad_segments[i], blk, off); block = get_block(sufile.blocks[blk], off * sizeof(struct nandfs_segment_usage *)); su = (struct nandfs_segment_usage *)block; su[off].su_lastmod = nandfs_time; su[off].su_nblocks = 0; su[off].su_flags = NANDFS_SEGMENT_USAGE_ERROR; } } static void save_cpfile(void) { struct nandfs_cpfile_header *header; struct nandfs_checkpoint *cp, *initial_cp; int i, entries = blocksize / sizeof(struct nandfs_checkpoint); uint64_t cno; header = (struct nandfs_cpfile_header *)get_block(cpfile.blocks[0], 0); header->ch_ncheckpoints = 1; header->ch_nsnapshots = 0; cp = (struct nandfs_checkpoint *)header; /* fill first checkpoint data*/ initial_cp = &cp[NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET]; initial_cp->cp_flags = 0; initial_cp->cp_checkpoints_count = 0; initial_cp->cp_cno = NANDFS_FIRST_CNO; initial_cp->cp_create = nandfs_time; initial_cp->cp_nblk_inc = seg_endblock - 1; initial_cp->cp_blocks_count = seg_nblocks; memset(&initial_cp->cp_snapshot_list, 0, sizeof(struct nandfs_snapshot_list)); ifile.inode = &initial_cp->cp_ifile_inode; /* mark rest of cp as invalid */ cno = NANDFS_FIRST_CNO + 1; i = NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET + 1; for (; i < entries; i++) { cp[i].cp_cno = cno++; cp[i].cp_flags = NANDFS_CHECKPOINT_INVALID; } } static void init_inode(struct nandfs_inode *inode, struct file_info *file) { inode->i_blocks = file->nblocks; inode->i_ctime = nandfs_time; inode->i_mtime = nandfs_time; inode->i_mode = file->mode & 0xffff; inode->i_links_count = 1; if 
(file->size > 0) inode->i_size = file->size; else inode->i_size = 0; if (file->ino == NANDFS_USER_INO) inode->i_flags = SF_NOUNLINK|UF_NOUNLINK; else inode->i_flags = 0; } static void save_ifile(void) { struct nandfs_inode *inode; struct file_info *file; uint64_t ino, blk, off; uint32_t i; prepare_blockgrouped_file(ifile.blocks[0]); for (i = 0; i <= NANDFS_USER_INO; i++) alloc_blockgrouped_file(ifile.blocks[0], i); for (i = 0; i < nuserfiles; i++) { file = &user_files[i]; ino = file->ino; blk = ino / (blocksize / sizeof(*inode)); off = ino % (blocksize / sizeof(*inode)); inode = (struct nandfs_inode *)get_block(ifile.blocks[2 + blk], 2 + blk); file->inode = &inode[off]; init_inode(file->inode, file); } init_inode(ifile.inode, &ifile); init_inode(cpfile.inode, &cpfile); init_inode(sufile.inode, &sufile); init_inode(datfile.inode, &datfile); } static int create_fs(void) { uint64_t start_block; uint32_t segsum_size; char *data; int i; nuserfiles = nitems(user_files); /* Count and assign blocks */ count_seg_blocks(); segsum_size = segment_size(); start_block = NANDFS_FIRST_BLOCK + SIZE_TO_BLOCK(segsum_size); assign_file_blocks(start_block); /* Create super root structure */ save_super_root(); /* Create root directory */ save_root_dir(); /* Fill in file contents */ save_sufile(); save_cpfile(); save_ifile(); save_datfile(); /* Save fsdata and superblocks */ create_fsdata(); create_super_block(); for (i = 0; i < NANDFS_NFSAREAS; i++) { if (fsdata_blocks_state[i] != NANDFS_BLOCK_GOOD) continue; data = get_block((i * erasesize)/blocksize, 0); save_fsdata(data); data = get_block((i * erasesize + NANDFS_SBLOCK_OFFSET_BYTES) / blocksize, 0); if (blocksize > NANDFS_SBLOCK_OFFSET_BYTES) data += NANDFS_SBLOCK_OFFSET_BYTES; save_super_block(data); memset(data + sizeof(struct nandfs_super_block), 0xff, (blocksize - sizeof(struct nandfs_super_block) - NANDFS_SBLOCK_OFFSET_BYTES)); } /* Save segment summary and CRCs */ save_segsum(get_block(NANDFS_FIRST_BLOCK, 0)); return (0); } static void write_fs(int fda) { struct nandfs_block *block; char *data; u_int ret; /* Overwrite next block with ff if not nand device */ if (!is_nand) { data = get_block(seg_endblock, 0); memset(data, 0xff, blocksize); } LIST_FOREACH(block, &block_head, block_link) { lseek(fda, block->number * blocksize, SEEK_SET); ret = write(fda, block->data, blocksize); if (ret != blocksize) err(1, "cannot write filesystem data"); } } static void check_parameters(void) { int i; /* check blocksize */ if ((blocksize < NANDFS_MIN_BLOCKSIZE) || (blocksize > MAXBSIZE) || ((blocksize - 1) & blocksize)) { errx(1, "Bad blocksize (%zu). Must be in range [%u-%u] " "and a power of two.", blocksize, NANDFS_MIN_BLOCKSIZE, MAXBSIZE); } /* check blocks per segment */ if ((blocks_per_segment < NANDFS_SEG_MIN_BLOCKS) || ((blocks_per_segment - 1) & blocks_per_segment)) errx(1, "Bad blocks per segment (%lu). Must be greater than " "%u and a power of two.", blocks_per_segment, NANDFS_SEG_MIN_BLOCKS); /* check reserved segment percentage */ if ((rsv_segment_percent < 1) || (rsv_segment_percent > 99)) errx(1, "Bad reserved segment percentage. " "Must be in range 1..99."); /* check volume label */ i = 0; if (volumelabel) { while (isalnum(volumelabel[i])) i++; if (volumelabel[i] != '\0') { errx(1, "Bad volume label. " "Valid characters are alphanumerics."); } if (strlen(volumelabel) >= 16) errx(1, "Bad volume label. 
Length is longer than %d.", 16); } nandfs_time = time(NULL); } static void print_parameters(void) { printf("filesystem parameters:\n"); printf("blocksize: %#zx sectorsize: %#zx\n", blocksize, sectorsize); printf("erasesize: %#jx mediasize: %#jx\n", erasesize, mediasize); printf("segment size: %#jx blocks per segment: %#x\n", segsize, (uint32_t)blocks_per_segment); } /* * Exit with error if file system is mounted. */ static void check_mounted(const char *fname, mode_t mode) { struct statfs *mp; const char *s1, *s2; size_t len; int n, r; if (!(n = getmntinfo(&mp, MNT_NOWAIT))) err(1, "getmntinfo"); len = strlen(_PATH_DEV); s1 = fname; if (!strncmp(s1, _PATH_DEV, len)) s1 += len; r = S_ISCHR(mode) && s1 != fname && *s1 == 'r'; for (; n--; mp++) { s2 = mp->f_mntfromname; if (!strncmp(s2, _PATH_DEV, len)) s2 += len; if ((r && s2 != mp->f_mntfromname && !strcmp(s1 + 1, s2)) || !strcmp(s1, s2)) errx(1, "%s is mounted on %s", fname, mp->f_mntonname); } } static void calculate_geometry(int fd) { struct chip_param_io chip_params; char ident[DISK_IDENT_SIZE]; char medianame[MAXPATHLEN]; /* Check storage type */ g_get_ident(fd, ident, DISK_IDENT_SIZE); g_get_name(ident, medianame, MAXPATHLEN); debug("device name: %s", medianame); is_nand = (strstr(medianame, "gnand") != NULL); debug("is_nand = %d", is_nand); sectorsize = g_sectorsize(fd); debug("sectorsize: %#zx", sectorsize); /* Get storage size */ mediasize = g_mediasize(fd); debug("mediasize: %#jx", mediasize); /* Get storage erase unit size */ if (!is_nand) erasesize = NANDFS_DEF_ERASESIZE; else if (ioctl(fd, NAND_IO_GET_CHIP_PARAM, &chip_params) != -1) erasesize = chip_params.page_size * chip_params.pages_per_block; else errx(1, "Cannot ioctl(NAND_IO_GET_CHIP_PARAM)"); debug("erasesize: %#jx", (uintmax_t)erasesize); if (blocks_per_segment == 0) { if (erasesize >= NANDFS_MIN_SEGSIZE) blocks_per_segment = erasesize / blocksize; else blocks_per_segment = NANDFS_MIN_SEGSIZE / blocksize; } /* Calculate number of segments */ segsize = blocksize * blocks_per_segment; nsegments = ((mediasize - NANDFS_NFSAREAS * erasesize) / segsize) - 2; debug("segsize: %#jx", segsize); debug("nsegments: %#jx", nsegments); } static void erase_device(int fd) { int rest, failed; uint64_t i, nblocks; off_t offset; failed = 0; for (i = 0; i < NANDFS_NFSAREAS; i++) { debug("Deleting %jx\n", i * erasesize); if (g_delete(fd, i * erasesize, erasesize)) { printf("cannot delete %jx\n", i * erasesize); fsdata_blocks_state[i] = NANDFS_BLOCK_BAD; failed++; } else fsdata_blocks_state[i] = NANDFS_BLOCK_GOOD; } if (failed == NANDFS_NFSAREAS) { printf("%d first blocks not usable. 
Unable to create " "filesystem.\n", failed); exit(1); } for (i = 0; i < nsegments; i++) { offset = NANDFS_NFSAREAS * erasesize + i * segsize; if (g_delete(fd, offset, segsize)) { printf("cannot delete segment %jx (offset %jd)\n", i, offset); bad_segments_count++; bad_segments = realloc(bad_segments, bad_segments_count * sizeof(uint32_t)); bad_segments[bad_segments_count - 1] = i; } } if (bad_segments_count == nsegments) { printf("no valid segments\n"); exit(1); } /* Delete remaining blocks at the end of device */ rest = mediasize % segsize; nblocks = rest / erasesize; for (i = 0; i < nblocks; i++) { offset = (segsize * nsegments) + (i * erasesize); if (g_delete(fd, offset, erasesize)) { printf("cannot delete space after last segment " "- probably a bad block\n"); } } } static void erase_initial(int fd) { char buf[512]; u_int i; memset(buf, 0xff, sizeof(buf)); lseek(fd, 0, SEEK_SET); for (i = 0; i < NANDFS_NFSAREAS * erasesize; i += sizeof(buf)) write(fd, buf, sizeof(buf)); } static void create_nandfs(int fd) { create_fs(); write_fs(fd); } static void print_summary(void) { printf("filesystem was created successfully\n"); printf("total segments: %#jx valid segments: %#jx\n", nsegments, nsegments - bad_segments_count); printf("total space: %ju MB free: %ju MB\n", (nsegments * blocks_per_segment * blocksize) / (1024 * 1024), ((nsegments - bad_segments_count) * blocks_per_segment * blocksize) / (1024 * 1024)); } int main(int argc, char *argv[]) { struct stat sb; char buf[MAXPATHLEN]; const char opts[] = "b:B:L:m:"; const char *fname; int ch, fd; while ((ch = getopt(argc, argv, opts)) != -1) { switch (ch) { case 'b': blocksize = strtol(optarg, (char **)NULL, 10); if (blocksize == 0) usage(); break; case 'B': blocks_per_segment = strtol(optarg, (char **)NULL, 10); if (blocks_per_segment == 0) usage(); break; case 'L': volumelabel = optarg; break; case 'm': rsv_segment_percent = strtol(optarg, (char **)NULL, 10); if (rsv_segment_percent == 0) usage(); break; default: usage(); } } argc -= optind; argv += optind; if (argc < 1 || argc > 2) usage(); /* construct proper device path */ fname = *argv++; if (!strchr(fname, '/')) { snprintf(buf, sizeof(buf), "%s%s", _PATH_DEV, fname); if (!(fname = strdup(buf))) err(1, NULL); } fd = g_open(fname, 1); if (fd == -1) err(1, "Cannot open %s", fname); if (fstat(fd, &sb) == -1) err(1, "Cannot stat %s", fname); if (!S_ISCHR(sb.st_mode)) warnx("%s is not a character device", fname); check_mounted(fname, sb.st_mode); calculate_geometry(fd); check_parameters(); print_parameters(); if (is_nand) erase_device(fd); else erase_initial(fd); create_nandfs(fd); print_summary(); g_close(fd); return (0); } Index: head/sbin/quotacheck/quotacheck.c =================================================================== --- head/sbin/quotacheck/quotacheck.c (revision 313779) +++ head/sbin/quotacheck/quotacheck.c (revision 313780) @@ -1,725 +1,726 @@ /* * Copyright (c) 1980, 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1990, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint static char sccsid[] = "@(#)quotacheck.c 8.3 (Berkeley) 1/29/94"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); /* * Fix up / report on disk quotas & usage */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "quotacheck.h" char *qfname = QUOTAFILENAME; char *qfextension[] = INITQFNAMES; char *quotagroup = QUOTAGROUP; union { struct fs sblk; char dummy[MAXBSIZE]; } sb_un; #define sblock sb_un.sblk union { struct cg cgblk; char dummy[MAXBSIZE]; } cg_un; #define cgblk cg_un.cgblk long dev_bsize = 1; ino_t maxino; union dinode { struct ufs1_dinode dp1; struct ufs2_dinode dp2; }; #define DIP(dp, field) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? 
\ (dp)->dp1.field : (dp)->dp2.field) #define HASUSR 1 #define HASGRP 2 struct fileusage { struct fileusage *fu_next; u_long fu_curinodes; u_long fu_curblocks; u_long fu_id; char fu_name[1]; /* actually bigger */ }; #define FUHASH 1024 /* must be power of two */ struct fileusage *fuhead[MAXQUOTAS][FUHASH]; int aflag; /* all file systems */ int cflag; /* convert format to 32 or 64 bit size */ int gflag; /* check group quotas */ int uflag; /* check user quotas */ int vflag; /* verbose */ int fi; /* open disk file descriptor */ struct fileusage * addid(u_long, int, char *, const char *); void bread(ufs2_daddr_t, char *, long); void freeinodebuf(void); union dinode * getnextinode(ino_t); int getquotagid(void); struct fileusage * lookup(u_long, int); int oneof(char *, char*[], int); void printchanges(const char *, int, struct dqblk *, struct fileusage *, u_long); void setinodebuf(ino_t); int update(const char *, struct quotafile *, int); void usage(void); int main(int argc, char *argv[]) { struct fstab *fs; struct passwd *pw; struct group *gr; struct quotafile *qfu, *qfg; int i, argnum, maxrun, errs, ch; long done = 0; char *name; errs = maxrun = 0; while ((ch = getopt(argc, argv, "ac:guvl:")) != -1) { switch(ch) { case 'a': aflag++; break; case 'c': if (cflag) usage(); cflag = atoi(optarg); break; case 'g': gflag++; break; case 'u': uflag++; break; case 'v': vflag++; break; case 'l': maxrun = atoi(optarg); break; default: usage(); } } argc -= optind; argv += optind; if ((argc == 0 && !aflag) || (argc > 0 && aflag)) usage(); if (cflag && cflag != 32 && cflag != 64) usage(); if (!gflag && !uflag) { gflag++; uflag++; } if (gflag) { setgrent(); while ((gr = getgrent()) != NULL) (void) addid((u_long)gr->gr_gid, GRPQUOTA, gr->gr_name, NULL); endgrent(); } if (uflag) { setpwent(); while ((pw = getpwent()) != NULL) (void) addid((u_long)pw->pw_uid, USRQUOTA, pw->pw_name, NULL); endpwent(); } /* * The maxrun (-l) option is now deprecated. */ if (maxrun > 0) warnx("the -l option is now deprecated"); if (aflag) exit(checkfstab(uflag, gflag)); if (setfsent() == 0) errx(1, "%s: can't open", FSTAB); while ((fs = getfsent()) != NULL) { if (((argnum = oneof(fs->fs_file, argv, argc)) >= 0 || (argnum = oneof(fs->fs_spec, argv, argc)) >= 0) && (name = blockcheck(fs->fs_spec))) { done |= 1 << argnum; qfu = NULL; if (uflag) qfu = quota_open(fs, USRQUOTA, O_CREAT|O_RDWR); qfg = NULL; if (gflag) qfg = quota_open(fs, GRPQUOTA, O_CREAT|O_RDWR); if (qfu == NULL && qfg == NULL) continue; errs += chkquota(name, qfu, qfg); if (qfu) quota_close(qfu); if (qfg) quota_close(qfg); } } endfsent(); for (i = 0; i < argc; i++) if ((done & (1 << i)) == 0) fprintf(stderr, "%s not found in %s\n", argv[i], FSTAB); exit(errs); } void usage(void) { (void)fprintf(stderr, "%s\n%s\n", "usage: quotacheck [-guv] [-c 32 | 64] [-l maxrun] -a", " quotacheck [-guv] [-c 32 | 64] filesystem ..."); exit(1); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Scan the specified file system to check quota(s) present on it. 
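 *
 * The superblock probe that chkquota() performs is worth seeing in
 * isolation, so a hedged, self-contained model of it follows. The
 * offsets and magic numbers echo the usual SBLOCKSEARCH candidates
 * (assumed here), toy_read() is a hypothetical stand-in for the bread()
 * wrapper, and struct toy_fs is a trimmed stand-in for struct fs.
 */
#if 0	/* illustrative sketch only, not part of this change */
#include <stdint.h>

#define TOY_UFS1_MAGIC	0x011954
#define TOY_UFS2_MAGIC	0x19540119

struct toy_fs {
	int32_t fs_magic;
	int64_t fs_sblockloc;
};

/* Hypothetical reader: fill *out from the given byte offset; a real
 * program would pread(2) from the device here. */
static int
toy_read(int64_t off, struct toy_fs *out)
{

	(void)off;
	(void)out;
	return (-1);	/* stub, so the sketch is self-contained */
}

static int
toy_find_sblock(struct toy_fs *fs)
{
	/* UFS2 at 64k, UFS1 at 8k, a UFS2 alternate at 256k; -1 ends. */
	static const int64_t try_off[] = { 65536, 8192, 262144, -1 };
	int i;

	for (i = 0; try_off[i] != -1; i++) {
		if (toy_read(try_off[i], fs) != 0)
			continue;
		/*
		 * A UFS2 superblock must also record its own location,
		 * which filters out stale backup copies found at other
		 * candidate offsets.
		 */
		if (fs->fs_magic == TOY_UFS1_MAGIC ||
		    (fs->fs_magic == TOY_UFS2_MAGIC &&
		    fs->fs_sblockloc == try_off[i]))
			return (0);
	}
	return (-1);
}
#endif
/*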
*/ int chkquota(char *specname, struct quotafile *qfu, struct quotafile *qfg) { struct fileusage *fup; union dinode *dp; int cg, i, mode, errs = 0; ino_t ino, inosused, userino = 0, groupino = 0; dev_t dev, userdev = 0, groupdev = 0; struct stat sb; const char *mntpt; char *cp; if (qfu != NULL) mntpt = quota_fsname(qfu); else if (qfg != NULL) mntpt = quota_fsname(qfg); else errx(1, "null quotafile information passed to chkquota()\n"); if (cflag) { if (vflag && qfu != NULL) printf("%s: convert user quota to %d bits\n", mntpt, cflag); if (qfu != NULL && quota_convert(qfu, cflag) < 0) { if (errno == EBADF) errx(1, "%s: cannot convert an active quota file", mntpt); err(1, "user quota conversion to size %d failed", cflag); } if (vflag && qfg != NULL) printf("%s: convert group quota to %d bits\n", mntpt, cflag); if (qfg != NULL && quota_convert(qfg, cflag) < 0) { if (errno == EBADF) errx(1, "%s: cannot convert an active quota file", mntpt); err(1, "group quota conversion to size %d failed", cflag); } } if ((fi = open(specname, O_RDONLY, 0)) < 0) { warn("%s", specname); return (1); } if ((stat(mntpt, &sb)) < 0) { warn("%s", mntpt); return (1); } dev = sb.st_dev; if (vflag) { (void)printf("*** Checking "); if (qfu) (void)printf("user%s", qfg ? " and " : ""); if (qfg) (void)printf("group"); (void)printf(" quotas for %s (%s)\n", specname, mntpt); } if (qfu) { if (stat(quota_qfname(qfu), &sb) == 0) { userino = sb.st_ino; userdev = sb.st_dev; } } if (qfg) { if (stat(quota_qfname(qfg), &sb) == 0) { groupino = sb.st_ino; groupdev = sb.st_dev; } } sync(); dev_bsize = 1; for (i = 0; sblock_try[i] != -1; i++) { bread(sblock_try[i], (char *)&sblock, (long)SBLOCKSIZE); if ((sblock.fs_magic == FS_UFS1_MAGIC || (sblock.fs_magic == FS_UFS2_MAGIC && sblock.fs_sblockloc == sblock_try[i])) && sblock.fs_bsize <= MAXBSIZE && sblock.fs_bsize >= sizeof(struct fs)) break; } if (sblock_try[i] == -1) { warn("Cannot find file system superblock"); return (1); } dev_bsize = sblock.fs_fsize / fsbtodb(&sblock, 1); maxino = sblock.fs_ncg * sblock.fs_ipg; for (cg = 0; cg < sblock.fs_ncg; cg++) { ino = cg * sblock.fs_ipg; setinodebuf(ino); bread(fsbtodb(&sblock, cgtod(&sblock, cg)), (char *)(&cgblk), sblock.fs_cgsize); if (sblock.fs_magic == FS_UFS2_MAGIC) inosused = cgblk.cg_initediblk; else inosused = sblock.fs_ipg; /* * If we are using soft updates, then we can trust the * cylinder group inode allocation maps to tell us which * inodes are allocated. We will scan the used inode map * to find the inodes that are really in use, and then * read only those inodes in from disk. */ if (sblock.fs_flags & FS_DOSOFTDEP) { if (!cg_chkmagic(&cgblk)) errx(1, "CG %d: BAD MAGIC NUMBER\n", cg); cp = &cg_inosused(&cgblk)[(inosused - 1) / CHAR_BIT]; for ( ; inosused > 0; inosused -= CHAR_BIT, cp--) { if (*cp == 0) continue; for (i = 1 << (CHAR_BIT - 1); i > 0; i >>= 1) { if (*cp & i) break; inosused--; } break; } if (inosused <= 0) continue; } for (i = 0; i < inosused; i++, ino++) { - if ((dp = getnextinode(ino)) == NULL || ino < ROOTINO || + if ((dp = getnextinode(ino)) == NULL || + ino < UFS_ROOTINO || (mode = DIP(dp, di_mode) & IFMT) == 0) continue; /* * XXX: Do not account for UIDs or GIDs that appear * to be negative to prevent generating 100GB+ * quota files. 
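	 * (di_uid and di_gid are unsigned 32-bit fields, so "negative" here
	 * means ids of 2^31 and above; because a quota file is indexed
	 * directly by id, honoring such ids would grow it accordingly.)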
*/ if ((int)DIP(dp, di_uid) < 0 || (int)DIP(dp, di_gid) < 0) { if (vflag) { if (aflag) (void)printf("%s: ", mntpt); (void)printf("out of range UID/GID (%u/%u) ino=%ju\n", DIP(dp, di_uid), DIP(dp,di_gid), (uintmax_t)ino); } continue; } /* * Do not account for file system snapshot files * or the actual quota data files to be consistent * with how they are handled inside the kernel. */ #ifdef SF_SNAPSHOT if (DIP(dp, di_flags) & SF_SNAPSHOT) continue; #endif if ((ino == userino && dev == userdev) || (ino == groupino && dev == groupdev)) continue; if (qfg) { fup = addid((u_long)DIP(dp, di_gid), GRPQUOTA, (char *)0, mntpt); fup->fu_curinodes++; if (mode == IFREG || mode == IFDIR || mode == IFLNK) fup->fu_curblocks += DIP(dp, di_blocks); } if (qfu) { fup = addid((u_long)DIP(dp, di_uid), USRQUOTA, (char *)0, mntpt); fup->fu_curinodes++; if (mode == IFREG || mode == IFDIR || mode == IFLNK) fup->fu_curblocks += DIP(dp, di_blocks); } } } freeinodebuf(); if (qfu) errs += update(mntpt, qfu, USRQUOTA); if (qfg) errs += update(mntpt, qfg, GRPQUOTA); close(fi); (void)fflush(stdout); return (errs); } /* * Update a specified quota file. */ int update(const char *fsname, struct quotafile *qf, int type) { struct fileusage *fup; u_long id, lastid, highid = 0; struct dqblk dqbuf; struct stat sb; static struct dqblk zerodqbuf; static struct fileusage zerofileusage; /* * Scan the on-disk quota file and record any usage changes. */ lastid = quota_maxid(qf); for (id = 0; id <= lastid; id++) { if (quota_read(qf, &dqbuf, id) < 0) dqbuf = zerodqbuf; if ((fup = lookup(id, type)) == NULL) fup = &zerofileusage; if (fup->fu_curinodes || fup->fu_curblocks || dqbuf.dqb_bsoftlimit || dqbuf.dqb_bhardlimit || dqbuf.dqb_isoftlimit || dqbuf.dqb_ihardlimit) highid = id; if (dqbuf.dqb_curinodes == fup->fu_curinodes && dqbuf.dqb_curblocks == fup->fu_curblocks) { fup->fu_curinodes = 0; fup->fu_curblocks = 0; continue; } printchanges(fsname, type, &dqbuf, fup, id); dqbuf.dqb_curinodes = fup->fu_curinodes; dqbuf.dqb_curblocks = fup->fu_curblocks; (void) quota_write_usage(qf, &dqbuf, id); fup->fu_curinodes = 0; fup->fu_curblocks = 0; } /* * Walk the hash table looking for ids with non-zero usage * that are not currently recorded in the quota file. E.g. * ids that are past the end of the current file. */ for (id = 0; id < FUHASH; id++) { for (fup = fuhead[type][id]; fup != NULL; fup = fup->fu_next) { if (fup->fu_id <= lastid) continue; if (fup->fu_curinodes == 0 && fup->fu_curblocks == 0) continue; bzero(&dqbuf, sizeof(struct dqblk)); if (fup->fu_id > highid) highid = fup->fu_id; printchanges(fsname, type, &dqbuf, fup, fup->fu_id); dqbuf.dqb_curinodes = fup->fu_curinodes; dqbuf.dqb_curblocks = fup->fu_curblocks; (void) quota_write_usage(qf, &dqbuf, fup->fu_id); fup->fu_curinodes = 0; fup->fu_curblocks = 0; } } /* * If this is old format file, then size may be smaller, * so ensure that we only truncate when it will make things * smaller, and not if it will grow an old format file. */ if (highid < lastid && stat(quota_qfname(qf), &sb) == 0 && sb.st_size > (((off_t)highid + 2) * sizeof(struct dqblk))) truncate(quota_qfname(qf), (((off_t)highid + 2) * sizeof(struct dqblk))); return (0); } /* * Check to see if target appears in list of size cnt. */ int oneof(char *target, char *list[], int cnt) { int i; for (i = 0; i < cnt; i++) if (strcmp(target, list[i]) == 0) return (i); return (-1); } /* * Determine the group identifier for quota files. 
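 *
 * getquotagid() follows, and after it come the file usage table
 * routines. Their chained hash is simple enough to model in a few
 * lines, so a hedged, self-contained sketch is given here first; the
 * toy_ names are illustrative and the payload is trimmed down from
 * struct fileusage.
 */
#if 0	/* illustrative sketch only, not part of this change */
#include <stdlib.h>

#define TOY_NBUCKET	1024	/* like FUHASH: must be a power of two */

struct toy_usage {
	struct toy_usage *fu_next;
	unsigned long fu_id;
	unsigned long fu_curinodes;
	unsigned long fu_curblocks;
};

static struct toy_usage *toy_head[TOY_NBUCKET];

/* Mirrors lookup(): the power-of-two bucket count turns the modulo
 * into a cheap mask. */
static struct toy_usage *
toy_lookup(unsigned long id)
{
	struct toy_usage *fup;

	for (fup = toy_head[id & (TOY_NBUCKET - 1)]; fup != NULL;
	    fup = fup->fu_next)
		if (fup->fu_id == id)
			return (fup);
	return (NULL);
}

/* Mirrors addid() minus the name handling: insert at the bucket head
 * if the id is not already present. */
static struct toy_usage *
toy_addid(unsigned long id)
{
	struct toy_usage *fup, **fhp;

	if ((fup = toy_lookup(id)) != NULL)
		return (fup);
	if ((fup = calloc(1, sizeof(*fup))) == NULL)
		return (NULL);
	fhp = &toy_head[id & (TOY_NBUCKET - 1)];
	fup->fu_id = id;
	fup->fu_next = *fhp;
	*fhp = fup;
	return (fup);
}
#endif
/*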
*/ int getquotagid(void) { struct group *gr; if ((gr = getgrnam(quotagroup)) != NULL) return (gr->gr_gid); return (-1); } /* * Routines to manage the file usage table. * * Lookup an id of a specific type. */ struct fileusage * lookup(u_long id, int type) { struct fileusage *fup; for (fup = fuhead[type][id & (FUHASH-1)]; fup != NULL; fup = fup->fu_next) if (fup->fu_id == id) return (fup); return (NULL); } /* * Add a new file usage id if it does not already exist. */ struct fileusage * addid(u_long id, int type, char *name, const char *fsname) { struct fileusage *fup, **fhp; int len; if ((fup = lookup(id, type)) != NULL) return (fup); if (name) len = strlen(name); else len = 0; if ((fup = calloc(1, sizeof(*fup) + len)) == NULL) errx(1, "calloc failed"); fhp = &fuhead[type][id & (FUHASH - 1)]; fup->fu_next = *fhp; *fhp = fup; fup->fu_id = id; if (name) bcopy(name, fup->fu_name, len + 1); else { (void)sprintf(fup->fu_name, "%lu", id); if (vflag) { if (aflag && fsname != NULL) (void)printf("%s: ", fsname); printf("unknown %cid: %lu\n", type == USRQUOTA ? 'u' : 'g', id); } } return (fup); } /* * Special purpose version of ginode used to optimize pass * over all the inodes in numerical order. */ static ino_t nextino, lastinum, lastvalidinum; static long readcnt, readpercg, fullcnt, inobufsize, partialcnt, partialsize; static caddr_t inodebuf; #define INOBUFSIZE 56*1024 /* size of buffer to read inodes */ union dinode * getnextinode(ino_t inumber) { long size; ufs2_daddr_t dblk; union dinode *dp; static caddr_t nextinop; if (inumber != nextino++ || inumber > lastvalidinum) errx(1, "bad inode number %ju to nextinode", (uintmax_t)inumber); if (inumber >= lastinum) { readcnt++; dblk = fsbtodb(&sblock, ino_to_fsba(&sblock, lastinum)); if (readcnt % readpercg == 0) { size = partialsize; lastinum += partialcnt; } else { size = inobufsize; lastinum += fullcnt; } /* * If bread returns an error, it will already have zeroed * out the buffer, so we do not need to do so here. */ bread(dblk, inodebuf, size); nextinop = inodebuf; } dp = (union dinode *)nextinop; if (sblock.fs_magic == FS_UFS1_MAGIC) nextinop += sizeof(struct ufs1_dinode); else nextinop += sizeof(struct ufs2_dinode); return (dp); } /* * Prepare to scan a set of inodes. */ void setinodebuf(ino_t inum) { if (inum % sblock.fs_ipg != 0) errx(1, "bad inode number %ju to setinodebuf", (uintmax_t)inum); lastvalidinum = inum + sblock.fs_ipg - 1; nextino = inum; lastinum = inum; readcnt = 0; if (inodebuf != NULL) return; inobufsize = blkroundup(&sblock, INOBUFSIZE); fullcnt = inobufsize / ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode)); readpercg = sblock.fs_ipg / fullcnt; partialcnt = sblock.fs_ipg % fullcnt; partialsize = partialcnt * ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode)); if (partialcnt != 0) { readpercg++; } else { partialcnt = fullcnt; partialsize = inobufsize; } if ((inodebuf = malloc((unsigned)inobufsize)) == NULL) errx(1, "cannot allocate space for inode buffer"); } /* * Free up data structures used to scan inodes. */ void freeinodebuf(void) { if (inodebuf != NULL) free(inodebuf); inodebuf = NULL; } /* * Read specified disk blocks. */ void bread(ufs2_daddr_t bno, char *buf, long cnt) { if (lseek(fi, (off_t)bno * dev_bsize, SEEK_SET) < 0 || read(fi, buf, cnt) != cnt) errx(1, "bread failed on block %ld", (long)bno); } /* * Display updated block and i-node counts. 
*/ void printchanges(const char *fsname, int type, struct dqblk *dp, struct fileusage *fup, u_long id) { if (!vflag) return; if (aflag) (void)printf("%s: ", fsname); if (fup->fu_name[0] == '\0') (void)printf("%-8lu fixed ", id); else (void)printf("%-8s fixed ", fup->fu_name); switch (type) { case GRPQUOTA: (void)printf("(group):"); break; case USRQUOTA: (void)printf("(user): "); break; default: (void)printf("(unknown quota type %d)", type); break; } if (dp->dqb_curinodes != fup->fu_curinodes) (void)printf("\tinodes %lu -> %lu", (u_long)dp->dqb_curinodes, (u_long)fup->fu_curinodes); if (dp->dqb_curblocks != fup->fu_curblocks) (void)printf("\tblocks %lu -> %lu", (u_long)dp->dqb_curblocks, (u_long)fup->fu_curblocks); (void)printf("\n"); } Index: head/sbin/restore/dirs.c =================================================================== --- head/sbin/restore/dirs.c (revision 313779) +++ head/sbin/restore/dirs.c (revision 313780) @@ -1,817 +1,817 @@ /* * Copyright (c) 1983, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)dirs.c 8.7 (Berkeley) 5/1/95"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "restore.h" #include "extern.h" /* * Symbol table of directories read from tape. */ #define HASHSIZE 1000 #define INOHASH(val) (val % HASHSIZE) struct inotab { struct inotab *t_next; ino_t t_ino; int32_t t_seekpt; int32_t t_size; }; static struct inotab *inotab[HASHSIZE]; /* * Information retained about directories. 
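 * One modeinfo record per directory is appended to the temporary mode
 * file as directories are extracted; setdirmodes() later replays that
 * file, so ownership, permissions, and timestamps are applied only after
 * the whole tree exists. The ctimep/mtimep pairs are laid out as the
 * {access, modify} timespec arrays that utimensat(2) expects.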
*/ struct modeinfo { ino_t ino; struct timespec ctimep[2]; struct timespec mtimep[2]; mode_t mode; uid_t uid; gid_t gid; u_int flags; int extsize; }; /* * Definitions for library routines operating on directories. */ #undef DIRBLKSIZ #define DIRBLKSIZ 1024 struct rstdirdesc { int dd_fd; int32_t dd_loc; int32_t dd_size; char dd_buf[DIRBLKSIZ]; }; /* * Global variables for this file. */ static long seekpt; static FILE *df, *mf; static RST_DIR *dirp; static char dirfile[MAXPATHLEN] = "#"; /* No file */ static char modefile[MAXPATHLEN] = "#"; /* No file */ static char dot[2] = "."; /* So it can be modified */ static struct inotab *allocinotab(struct context *, long); static void flushent(void); static struct inotab *inotablookup(ino_t); static RST_DIR *opendirfile(const char *); static void putdir(char *, size_t); static void putdirattrs(char *, size_t); static void putent(struct direct *); static void rst_seekdir(RST_DIR *, long, long); static long rst_telldir(RST_DIR *); static struct direct *searchdir(ino_t, char *); static void fail_dirtmp(char *); /* * Extract directory contents, building up a directory structure * on disk for extraction by name. * If genmode is requested, save mode, owner, and times for all * directories on the tape. */ void extractdirs(int genmode) { struct inotab *itp; struct direct nulldir; int i, fd; const char *tmpdir; vprintf(stdout, "Extract directories from tape\n"); if ((tmpdir = getenv("TMPDIR")) == NULL || tmpdir[0] == '\0') tmpdir = _PATH_TMP; (void) sprintf(dirfile, "%s/rstdir%jd", tmpdir, (intmax_t)dumpdate); if (command != 'r' && command != 'R') { (void) strcat(dirfile, "-XXXXXX"); fd = mkstemp(dirfile); } else fd = open(dirfile, O_RDWR|O_CREAT|O_EXCL, 0666); if (fd == -1 || (df = fdopen(fd, "w")) == NULL) { if (fd != -1) close(fd); warn("%s: cannot create directory database", dirfile); done(1); } if (genmode != 0) { (void) sprintf(modefile, "%s/rstmode%jd", tmpdir, (intmax_t)dumpdate); if (command != 'r' && command != 'R') { (void) strcat(modefile, "-XXXXXX"); fd = mkstemp(modefile); } else fd = open(modefile, O_RDWR|O_CREAT|O_EXCL, 0666); if (fd == -1 || (mf = fdopen(fd, "w")) == NULL) { if (fd != -1) close(fd); warn("%s: cannot create modefile", modefile); done(1); } } nulldir.d_ino = 0; nulldir.d_type = DT_DIR; nulldir.d_namlen = 1; (void) strcpy(nulldir.d_name, "/"); nulldir.d_reclen = DIRSIZ(0, &nulldir); for (;;) { curfile.name = ""; curfile.action = USING; if (curfile.mode == 0 || (curfile.mode & IFMT) != IFDIR) break; itp = allocinotab(&curfile, seekpt); getfile(putdir, putdirattrs, xtrnull); putent(&nulldir); flushent(); itp->t_size = seekpt - itp->t_seekpt; } if (fclose(df) != 0) fail_dirtmp(dirfile); dirp = opendirfile(dirfile); if (dirp == NULL) fprintf(stderr, "opendirfile: %s\n", strerror(errno)); if (mf != NULL && fclose(mf) != 0) fail_dirtmp(modefile); i = dirlookup(dot); if (i == 0) panic("Root directory is not on tape\n"); } /* * skip over all the directories on the tape */ void skipdirs(void) { while (curfile.ino && (curfile.mode & IFMT) == IFDIR) { skipfile(); } } /* * Recursively find names and inumbers of all files in subtree * pname and pass them off to be processed. */ void treescan(char *pname, ino_t ino, long (*todo)(char *, ino_t, int)) { struct inotab *itp; struct direct *dp; int namelen; long bpt; char locname[MAXPATHLEN]; itp = inotablookup(ino); if (itp == NULL) { /* * Pname is name of a simple file or an unchanged directory. */ (void) (*todo)(pname, ino, LEAF); return; } /* * Pname is a dumped directory name. 
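 * The walk below shares the single global dirp, so after every recursive
 * treescan() call the saved cookie bpt is re-seeked before the next
 * sibling entry is read.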
*/ if ((*todo)(pname, ino, NODE) == FAIL) return; /* * begin search through the directory * skipping over "." and ".." */ (void) strlcpy(locname, pname, sizeof(locname)); (void) strlcat(locname, "/", sizeof(locname)); namelen = strlen(locname); rst_seekdir(dirp, itp->t_seekpt, itp->t_seekpt); dp = rst_readdir(dirp); /* "." */ if (dp != NULL && strcmp(dp->d_name, ".") == 0) dp = rst_readdir(dirp); /* ".." */ else fprintf(stderr, "Warning: `.' missing from directory %s\n", pname); if (dp != NULL && strcmp(dp->d_name, "..") == 0) dp = rst_readdir(dirp); /* first real entry */ else fprintf(stderr, "Warning: `..' missing from directory %s\n", pname); bpt = rst_telldir(dirp); /* * a zero inode signals end of directory */ while (dp != NULL) { locname[namelen] = '\0'; if (namelen + dp->d_namlen >= sizeof(locname)) { fprintf(stderr, "%s%s: name exceeds %zu char\n", locname, dp->d_name, sizeof(locname) - 1); } else { (void)strlcat(locname, dp->d_name, sizeof(locname)); treescan(locname, dp->d_ino, todo); rst_seekdir(dirp, bpt, itp->t_seekpt); } dp = rst_readdir(dirp); bpt = rst_telldir(dirp); } } /* - * Lookup a pathname which is always assumed to start from the ROOTINO. + * Lookup a pathname which is always assumed to start from the UFS_ROOTINO. */ struct direct * pathsearch(const char *pathname) { ino_t ino; struct direct *dp; char *path, *name, buffer[MAXPATHLEN]; strcpy(buffer, pathname); path = buffer; - ino = ROOTINO; + ino = UFS_ROOTINO; while (*path == '/') path++; dp = NULL; while ((name = strsep(&path, "/")) != NULL && *name != '\0') { if ((dp = searchdir(ino, name)) == NULL) return (NULL); ino = dp->d_ino; } return (dp); } /* * Lookup the requested name in directory inum. * Return its inode number if found, zero if it does not exist. */ static struct direct * searchdir(ino_t inum, char *name) { struct direct *dp; struct inotab *itp; int len; itp = inotablookup(inum); if (itp == NULL) return (NULL); rst_seekdir(dirp, itp->t_seekpt, itp->t_seekpt); len = strlen(name); do { dp = rst_readdir(dirp); if (dp == NULL) return (NULL); } while (dp->d_namlen != len || strncmp(dp->d_name, name, len) != 0); return (dp); } /* * Put the directory entries in the directory file */ static void putdir(char *buf, size_t size) { struct direct *dp; size_t loc, i; for (loc = 0; loc < size; ) { dp = (struct direct *)(buf + loc); if (Bcvt) swabst((u_char *)"ls", (u_char *) dp); if (oldinofmt && dp->d_ino != 0) { #if BYTE_ORDER == BIG_ENDIAN if (Bcvt) dp->d_namlen = dp->d_type; #else if (!Bcvt && dp->d_namlen == 0) dp->d_namlen = dp->d_type; #endif dp->d_type = DT_UNKNOWN; } i = DIRBLKSIZ - (loc & (DIRBLKSIZ - 1)); if ((dp->d_reclen & 0x3) != 0 || dp->d_reclen > i || dp->d_reclen < DIRSIZ(0, dp) #if NAME_MAX < 255 || dp->d_namlen > NAME_MAX #endif ) { vprintf(stdout, "Mangled directory: "); if ((dp->d_reclen & 0x3) != 0) vprintf(stdout, "reclen not multiple of 4 "); if (dp->d_reclen < DIRSIZ(0, dp)) vprintf(stdout, "reclen less than DIRSIZ (%u < %zu) ", dp->d_reclen, DIRSIZ(0, dp)); #if NAME_MAX < 255 if (dp->d_namlen > NAME_MAX) vprintf(stdout, "reclen name too big (%u > %u) ", dp->d_namlen, NAME_MAX); #endif vprintf(stdout, "\n"); loc += i; continue; } loc += dp->d_reclen; if (dp->d_ino != 0) { putent(dp); } } } /* * These variables are "local" to the following two functions. */ char dirbuf[DIRBLKSIZ]; long dirloc = 0; long prev = 0; /* * add a new directory entry to a file. 
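 * Entries accumulate in dirbuf and are written one DIRBLKSIZ block at a
 * time; both putent() and flushent() stretch the previous entry's
 * d_reclen out to the block boundary, so a reader chaining through
 * d_reclen never crosses a block edge.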
*/ static void putent(struct direct *dp) { dp->d_reclen = DIRSIZ(0, dp); if (dirloc + dp->d_reclen > DIRBLKSIZ) { ((struct direct *)(dirbuf + prev))->d_reclen = DIRBLKSIZ - prev; if (fwrite(dirbuf, DIRBLKSIZ, 1, df) != 1) fail_dirtmp(dirfile); dirloc = 0; } memmove(dirbuf + dirloc, dp, (long)dp->d_reclen); prev = dirloc; dirloc += dp->d_reclen; } /* * flush out a directory that is finished. */ static void flushent(void) { ((struct direct *)(dirbuf + prev))->d_reclen = DIRBLKSIZ - prev; if (fwrite(dirbuf, (int)dirloc, 1, df) != 1) fail_dirtmp(dirfile); seekpt = ftell(df); dirloc = 0; } /* * Save extended attributes for a directory entry to a file. */ static void putdirattrs(char *buf, size_t size) { if (mf != NULL && fwrite(buf, size, 1, mf) != 1) fail_dirtmp(modefile); } /* * Seek to an entry in a directory. * Only values returned by rst_telldir should be passed to rst_seekdir. * This routine handles many directories in a single file. * It takes the base of the directory in the file, plus * the desired seek offset into it. */ static void rst_seekdir(RST_DIR *dirp, long loc, long base) { if (loc == rst_telldir(dirp)) return; loc -= base; if (loc < 0) fprintf(stderr, "bad seek pointer to rst_seekdir %ld\n", loc); (void) lseek(dirp->dd_fd, base + rounddown2(loc, DIRBLKSIZ), SEEK_SET); dirp->dd_loc = loc & (DIRBLKSIZ - 1); if (dirp->dd_loc != 0) dirp->dd_size = read(dirp->dd_fd, dirp->dd_buf, DIRBLKSIZ); } /* * get next entry in a directory. */ struct direct * rst_readdir(RST_DIR *dirp) { struct direct *dp; for (;;) { if (dirp->dd_loc == 0) { dirp->dd_size = read(dirp->dd_fd, dirp->dd_buf, DIRBLKSIZ); if (dirp->dd_size <= 0) { dprintf(stderr, "error reading directory\n"); return (NULL); } } if (dirp->dd_loc >= dirp->dd_size) { dirp->dd_loc = 0; continue; } dp = (struct direct *)(dirp->dd_buf + dirp->dd_loc); if (dp->d_reclen == 0 || dp->d_reclen > DIRBLKSIZ + 1 - dirp->dd_loc) { dprintf(stderr, "corrupted directory: bad reclen %d\n", dp->d_reclen); return (NULL); } dirp->dd_loc += dp->d_reclen; if (dp->d_ino == 0 && strcmp(dp->d_name, "/") == 0) return (NULL); if (dp->d_ino >= maxino) { dprintf(stderr, "corrupted directory: bad inum %d\n", dp->d_ino); continue; } return (dp); } } /* * Simulate the opening of a directory */ void * rst_opendir(const char *name) { struct inotab *itp; RST_DIR *dirp; ino_t ino; if ((ino = dirlookup(name)) > 0 && (itp = inotablookup(ino)) != NULL) { dirp = opendirfile(dirfile); rst_seekdir(dirp, itp->t_seekpt, itp->t_seekpt); return (dirp); } return (NULL); } /* * In our case, there is nothing to do when closing a directory. */ void rst_closedir(void *arg) { RST_DIR *dirp; dirp = arg; (void)close(dirp->dd_fd); free(dirp); return; } /* * Simulate finding the current offset in the directory. */ static long rst_telldir(RST_DIR *dirp) { return ((long)lseek(dirp->dd_fd, (off_t)0, SEEK_CUR) - dirp->dd_size + dirp->dd_loc); } /* * Open a directory file. 
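 *
 * Just above, rst_telldir() and rst_seekdir() implement the cookie
 * arithmetic for these directory files; since it is easy to misread, a
 * hedged, self-contained model of it follows. The toy_ names are
 * illustrative, not part of restore.
 */
#if 0	/* illustrative sketch only, not part of this change */
#define TOY_DIRBLKSIZ	1024

struct toy_rstdir {
	long fd_pos;	/* where the next read(2) would start */
	long dd_size;	/* bytes obtained by the last block read */
	long dd_loc;	/* bytes already consumed in that block */
};

/* Like rst_telldir(): the absolute offset of the next unread entry is
 * the file position minus the unread tail of the current block. */
static long
toy_telldir(const struct toy_rstdir *d)
{

	return (d->fd_pos - d->dd_size + d->dd_loc);
}

/* Like rst_seekdir() with the base already folded in: position the
 * underlying file at the start of the containing block and keep the
 * residue in dd_loc; the real routine then refills the buffer whenever
 * dd_loc is nonzero. */
static void
toy_seekdir(struct toy_rstdir *d, long loc)
{

	d->fd_pos = loc - (loc % TOY_DIRBLKSIZ);
	d->dd_loc = loc % TOY_DIRBLKSIZ;
	d->dd_size = 0;
}
#endif
/*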
*/ static RST_DIR * opendirfile(const char *name) { RST_DIR *dirp; int fd; if ((fd = open(name, O_RDONLY)) == -1) return (NULL); if ((dirp = malloc(sizeof(RST_DIR))) == NULL) { (void)close(fd); return (NULL); } dirp->dd_fd = fd; dirp->dd_loc = 0; return (dirp); } /* * Set the mode, owner, and times for all new or changed directories */ void setdirmodes(int flags) { FILE *mf; struct modeinfo node; struct entry *ep; char *cp, *buf; const char *tmpdir; int bufsize; uid_t myuid; vprintf(stdout, "Set directory mode, owner, and times.\n"); if ((tmpdir = getenv("TMPDIR")) == NULL || tmpdir[0] == '\0') tmpdir = _PATH_TMP; if (command == 'r' || command == 'R') (void) sprintf(modefile, "%s/rstmode%jd", tmpdir, (intmax_t)dumpdate); if (modefile[0] == '#') { panic("modefile not defined\n"); fprintf(stderr, "directory mode, owner, and times not set\n"); return; } mf = fopen(modefile, "r"); if (mf == NULL) { fprintf(stderr, "fopen: %s\n", strerror(errno)); fprintf(stderr, "cannot open mode file %s\n", modefile); fprintf(stderr, "directory mode, owner, and times not set\n"); return; } clearerr(mf); bufsize = 0; myuid = getuid(); for (;;) { (void) fread((char *)&node, 1, sizeof(struct modeinfo), mf); if (ferror(mf)) { warn("%s: cannot read modefile.", modefile); fprintf(stderr, "Mode, owner, and times not set.\n"); break; } if (feof(mf)) break; if (node.extsize > 0) { if (bufsize < node.extsize) { if (bufsize > 0) free(buf); if ((buf = malloc(node.extsize)) != NULL) { bufsize = node.extsize; } else { bufsize = 0; } } if (bufsize >= node.extsize) { (void) fread(buf, 1, node.extsize, mf); if (ferror(mf)) { warn("%s: cannot read modefile.", modefile); fprintf(stderr, "Not all external "); fprintf(stderr, "attributes set.\n"); break; } } else { (void) fseek(mf, node.extsize, SEEK_CUR); if (ferror(mf)) { warn("%s: cannot seek in modefile.", modefile); fprintf(stderr, "Not all directory "); fprintf(stderr, "attributes set.\n"); break; } } } ep = lookupino(node.ino); if (command == 'i' || command == 'x') { if (ep == NULL) continue; if ((flags & FORCE) == 0 && ep->e_flags & EXISTED) { ep->e_flags &= ~NEW; continue; } - if (node.ino == ROOTINO && + if (node.ino == UFS_ROOTINO && reply("set owner/mode for '.'") == FAIL) continue; } if (ep == NULL) { panic("cannot find directory inode %ju\n", (uintmax_t)node.ino); continue; } cp = myname(ep); if (!Nflag) { if (node.extsize > 0) { if (bufsize >= node.extsize) { set_extattr(-1, cp, buf, node.extsize, SXA_FILE); } else { fprintf(stderr, "Cannot restore %s%s\n", "extended attributes for ", cp); } } if (myuid != 0) (void) chown(cp, myuid, node.gid); else (void) chown(cp, node.uid, node.gid); (void) chmod(cp, node.mode); utimensat(AT_FDCWD, cp, node.ctimep, 0); utimensat(AT_FDCWD, cp, node.mtimep, 0); (void) chflags(cp, node.flags); } ep->e_flags &= ~NEW; } if (bufsize > 0) free(buf); (void) fclose(mf); } /* * Generate a literal copy of a directory. 
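 * (genliteraldir(), below.)  First, a minimal standalone sketch -- not
 * part of the original source -- of the record stream setdirmodes()
 * walks above: a fixed-size header followed by `extsize' bytes of
 * variable payload, with the scratch buffer grown only when a larger
 * payload arrives.  struct hdr is an illustrative stand-in for
 * struct modeinfo.
 */

#include <stdio.h>
#include <stdlib.h>

struct hdr {
	long	ino;
	long	extsize;		/* bytes of payload that follow */
};

static int
walkmodes(FILE *mf)
{
	struct hdr h;
	char *buf = NULL;
	long cap = 0;

	while (fread(&h, sizeof(h), 1, mf) == 1) {
		if (h.extsize > 0) {
			if (h.extsize > cap) {	/* grow the scratch buffer */
				free(buf);
				if ((buf = malloc(h.extsize)) == NULL)
					return (-1);
				cap = h.extsize;
			}
			if (fread(buf, 1, h.extsize, mf) != (size_t)h.extsize)
				break;		/* truncated record */
		}
		/* ... apply the saved attributes for h.ino here ... */
	}
	free(buf);
	return (ferror(mf) ? -1 : 0);
}

/*
 * Generate a literal copy of a directory.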
*/ int genliteraldir(char *name, ino_t ino) { struct inotab *itp; int ofile, dp, i, size; char buf[BUFSIZ]; itp = inotablookup(ino); if (itp == NULL) panic("Cannot find directory inode %ju named %s\n", (uintmax_t)ino, name); if ((ofile = open(name, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { fprintf(stderr, "%s: ", name); (void) fflush(stderr); fprintf(stderr, "cannot create file: %s\n", strerror(errno)); return (FAIL); } rst_seekdir(dirp, itp->t_seekpt, itp->t_seekpt); dp = dup(dirp->dd_fd); for (i = itp->t_size; i > 0; i -= BUFSIZ) { size = MIN(i, BUFSIZ); if (read(dp, buf, (int) size) == -1) { fprintf(stderr, "write error extracting inode %ju, name %s\n", (uintmax_t)curfile.ino, curfile.name); fprintf(stderr, "read: %s\n", strerror(errno)); done(1); } if (!Nflag && write(ofile, buf, (int) size) == -1) { fprintf(stderr, "write error extracting inode %ju, name %s\n", (uintmax_t)curfile.ino, curfile.name); fprintf(stderr, "write: %s\n", strerror(errno)); done(1); } } (void) close(dp); (void) close(ofile); return (GOOD); } /* * Determine the type of an inode */ int inodetype(ino_t ino) { struct inotab *itp; itp = inotablookup(ino); if (itp == NULL) return (LEAF); return (NODE); } /* * Allocate and initialize a directory inode entry. * If requested, save its pertinent mode, owner, and time info. */ static struct inotab * allocinotab(struct context *ctxp, long seekpt) { struct inotab *itp; struct modeinfo node; itp = calloc(1, sizeof(struct inotab)); if (itp == NULL) panic("no memory for directory table\n"); itp->t_next = inotab[INOHASH(ctxp->ino)]; inotab[INOHASH(ctxp->ino)] = itp; itp->t_ino = ctxp->ino; itp->t_seekpt = seekpt; if (mf == NULL) return (itp); node.ino = ctxp->ino; node.mtimep[0].tv_sec = ctxp->atime_sec; node.mtimep[0].tv_nsec = ctxp->atime_nsec; node.mtimep[1].tv_sec = ctxp->mtime_sec; node.mtimep[1].tv_nsec = ctxp->mtime_nsec; node.ctimep[0].tv_sec = ctxp->atime_sec; node.ctimep[0].tv_nsec = ctxp->atime_nsec; node.ctimep[1].tv_sec = ctxp->birthtime_sec; node.ctimep[1].tv_nsec = ctxp->birthtime_nsec; node.extsize = ctxp->extsize; node.mode = ctxp->mode; node.flags = ctxp->file_flags; node.uid = ctxp->uid; node.gid = ctxp->gid; if (fwrite((char *)&node, sizeof(struct modeinfo), 1, mf) != 1) fail_dirtmp(modefile); return (itp); } /* * Look up an inode in the table of directories */ static struct inotab * inotablookup(ino_t ino) { struct inotab *itp; for (itp = inotab[INOHASH(ino)]; itp != NULL; itp = itp->t_next) if (itp->t_ino == ino) return (itp); return (NULL); } /* * Clean up and exit */ void done(int exitcode) { closemt(); if (modefile[0] != '#') { (void) truncate(modefile, 0); (void) unlink(modefile); } if (dirfile[0] != '#') { (void) truncate(dirfile, 0); (void) unlink(dirfile); } exit(exitcode); } /* * Print out information about the failure to save directory, * extended attribute, and mode information. 
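 * (fail_dirtmp(), below.)  First, a minimal standalone sketch -- not
 * part of the original source -- of the hash scheme allocinotab() and
 * inotablookup() use above: a fixed-size bucket array with new entries
 * pushed onto the head of their chain.  TBLSZ and HASH() are
 * illustrative stand-ins for the inotab sizing and INOHASH().
 */

#include <sys/types.h>
#include <stdlib.h>

#define TBLSZ		128
#define HASH(ino)	((ino) % TBLSZ)

struct inode_ent {
	ino_t	ino;
	struct inode_ent *next;		/* hash-chain link, like t_next */
};

static struct inode_ent *htbl[TBLSZ];

static struct inode_ent *
hinsert(ino_t ino)
{
	struct inode_ent *n;

	if ((n = calloc(1, sizeof(*n))) == NULL)
		return (NULL);
	n->ino = ino;
	n->next = htbl[HASH(ino)];	/* push onto the chain head */
	htbl[HASH(ino)] = n;
	return (n);
}

static struct inode_ent *
hlookup(ino_t ino)
{
	struct inode_ent *n;

	for (n = htbl[HASH(ino)]; n != NULL; n = n->next)
		if (n->ino == ino)
			return (n);
	return (NULL);
}

/*
 * Print out information about the failure to save directory,
 * extended attribute, and mode information.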
*/ static void fail_dirtmp(char *filename) { const char *tmpdir; warn("%s: cannot write directory database", filename); if (errno == ENOSPC) { if ((tmpdir = getenv("TMPDIR")) == NULL || tmpdir[0] == '\0') tmpdir = _PATH_TMP; fprintf(stderr, "Try making space in %s, %s\n%s\n", tmpdir, "or set environment variable TMPDIR", "to an alternate location with more disk space."); } done(1); } Index: head/sbin/restore/interactive.c =================================================================== --- head/sbin/restore/interactive.c (revision 313779) +++ head/sbin/restore/interactive.c (revision 313780) @@ -1,768 +1,768 @@ /* * Copyright (c) 1985, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)interactive.c 8.5 (Berkeley) 5/1/95"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "restore.h" #include "extern.h" /* * Things to handle interruptions. */ static int runshell; static jmp_buf reset; static char *nextarg = NULL; /* * Structure and routines associated with listing directories. */ struct afile { ino_t fnum; /* inode number of file */ char *fname; /* file name */ short len; /* name length */ char prefix; /* prefix character */ char postfix; /* postfix character */ }; struct arglist { int freeglob; /* glob structure needs to be freed */ int argcnt; /* next globbed argument to return */ glob_t glob; /* globbing information */ char *cmd; /* the current command */ }; static char *copynext(char *, char *); static int fcmp(const void *, const void *); static void formatf(struct afile *, int); static void getcmd(char *, char *, char *, size_t, struct arglist *); struct dirent *glob_readdir(void *); static int glob_stat(const char *, struct stat *); static void mkentry(char *, struct direct *, struct afile *); static void printlist(char *, char *); /* * Read and execute commands from the terminal. 
*/ void runcmdshell(void) { struct entry *np; ino_t ino; struct arglist arglist; char curdir[MAXPATHLEN]; char name[MAXPATHLEN]; char cmd[BUFSIZ]; arglist.freeglob = 0; arglist.argcnt = 0; arglist.glob.gl_flags = GLOB_ALTDIRFUNC; arglist.glob.gl_opendir = rst_opendir; arglist.glob.gl_readdir = glob_readdir; arglist.glob.gl_closedir = rst_closedir; arglist.glob.gl_lstat = glob_stat; arglist.glob.gl_stat = glob_stat; canon("/", curdir, sizeof(curdir)); loop: if (setjmp(reset) != 0) { if (arglist.freeglob != 0) { arglist.freeglob = 0; arglist.argcnt = 0; globfree(&arglist.glob); } nextarg = NULL; volno = 0; } runshell = 1; getcmd(curdir, cmd, name, sizeof(name), &arglist); switch (cmd[0]) { /* * Add elements to the extraction list. */ case 'a': if (strncmp(cmd, "add", strlen(cmd)) != 0) goto bad; ino = dirlookup(name); if (ino == 0) break; if (mflag) pathcheck(name); treescan(name, ino, addfile); break; /* * Change working directory. */ case 'c': if (strncmp(cmd, "cd", strlen(cmd)) != 0) goto bad; ino = dirlookup(name); if (ino == 0) break; if (inodetype(ino) == LEAF) { fprintf(stderr, "%s: not a directory\n", name); break; } (void) strcpy(curdir, name); break; /* * Delete elements from the extraction list. */ case 'd': if (strncmp(cmd, "delete", strlen(cmd)) != 0) goto bad; np = lookupname(name); if (np == NULL || (np->e_flags & NEW) == 0) { fprintf(stderr, "%s: not on extraction list\n", name); break; } treescan(name, np->e_ino, deletefile); break; /* * Extract the requested list. */ case 'e': if (strncmp(cmd, "extract", strlen(cmd)) != 0) goto bad; createfiles(); createlinks(); setdirmodes(0); if (dflag) checkrestore(); volno = 0; break; /* * List available commands. */ case 'h': if (strncmp(cmd, "help", strlen(cmd)) != 0) goto bad; case '?': fprintf(stderr, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", "Available commands are:\n", "\tls [arg] - list directory\n", "\tcd arg - change directory\n", "\tpwd - print current directory\n", "\tadd [arg] - add `arg' to list of", " files to be extracted\n", "\tdelete [arg] - delete `arg' from", " list of files to be extracted\n", "\textract - extract requested files\n", "\tsetmodes - set modes of requested directories\n", "\tquit - immediately exit program\n", "\twhat - list dump header information\n", "\tverbose - toggle verbose flag", " (useful with ``ls'')\n", "\thelp or `?' - print this list\n", "If no `arg' is supplied, the current", " directory is used\n"); break; /* * List a directory. */ case 'l': if (strncmp(cmd, "ls", strlen(cmd)) != 0) goto bad; printlist(name, curdir); break; /* * Print current directory. */ case 'p': if (strncmp(cmd, "pwd", strlen(cmd)) != 0) goto bad; if (curdir[1] == '\0') fprintf(stderr, "/\n"); else fprintf(stderr, "%s\n", &curdir[1]); break; /* * Quit. */ case 'q': if (strncmp(cmd, "quit", strlen(cmd)) != 0) goto bad; return; case 'x': if (strncmp(cmd, "xit", strlen(cmd)) != 0) goto bad; return; /* * Toggle verbose mode. */ case 'v': if (strncmp(cmd, "verbose", strlen(cmd)) != 0) goto bad; if (vflag) { fprintf(stderr, "verbose mode off\n"); vflag = 0; break; } fprintf(stderr, "verbose mode on\n"); vflag++; break; /* * Just restore requested directory modes. */ case 's': if (strncmp(cmd, "setmodes", strlen(cmd)) != 0) goto bad; setdirmodes(FORCE); break; /* * Print out dump header information. */ case 'w': if (strncmp(cmd, "what", strlen(cmd)) != 0) goto bad; printdumpinfo(); break; /* * Turn on debugging. 
*/ case 'D': if (strncmp(cmd, "Debug", strlen(cmd)) != 0) goto bad; if (dflag) { fprintf(stderr, "debugging mode off\n"); dflag = 0; break; } fprintf(stderr, "debugging mode on\n"); dflag++; break; /* * Unknown command. */ default: bad: fprintf(stderr, "%s: unknown command; type ? for help\n", cmd); break; } goto loop; } /* * Read and parse an interactive command. * The first word on the line is assigned to "cmd". If * there are no arguments on the command line, then "curdir" * is returned as the argument. If there are arguments * on the line they are returned one at a time on each * successive call to getcmd. Each argument is first assigned * to "name". If it does not start with "/" the pathname in * "curdir" is prepended to it. Finally "canon" is called to * eliminate any embedded ".." components. */ static void getcmd(char *curdir, char *cmd, char *name, size_t size, struct arglist *ap) { char *cp; static char input[BUFSIZ]; char output[BUFSIZ]; # define rawname input /* save space by reusing input buffer */ /* * Check to see if still processing arguments. */ if (ap->argcnt > 0) goto retnext; if (nextarg != NULL) goto getnext; /* * Read a command line and trim off trailing white space. */ do { fprintf(stderr, "restore > "); (void) fflush(stderr); if (fgets(input, BUFSIZ, terminal) == NULL) { strcpy(cmd, "quit"); return; } } while (input[0] == '\n'); for (cp = &input[strlen(input) - 2]; *cp == ' ' || *cp == '\t'; cp--) /* trim off trailing white space and newline */; *++cp = '\0'; /* * Copy the command into "cmd". */ cp = copynext(input, cmd); ap->cmd = cmd; /* * If no argument, use curdir as the default. */ if (*cp == '\0') { (void) strncpy(name, curdir, size); name[size - 1] = '\0'; return; } nextarg = cp; /* * Find the next argument. */ getnext: cp = copynext(nextarg, rawname); if (*cp == '\0') nextarg = NULL; else nextarg = cp; /* * If it is an absolute pathname, canonicalize it and return it. */ if (rawname[0] == '/') { canon(rawname, name, size); } else { /* * For relative pathnames, prepend the current directory to * it then canonicalize and return it. */ snprintf(output, sizeof(output), "%s/%s", curdir, rawname); canon(output, name, size); } switch (glob(name, GLOB_ALTDIRFUNC, NULL, &ap->glob)) { case GLOB_NOSPACE: fprintf(stderr, "%s: out of memory\n", ap->cmd); break; case GLOB_NOMATCH: fprintf(stderr, "%s %s: no such file or directory\n", ap->cmd, name); break; } if (ap->glob.gl_pathc == 0) return; ap->freeglob = 1; ap->argcnt = ap->glob.gl_pathc; retnext: strncpy(name, ap->glob.gl_pathv[ap->glob.gl_pathc - ap->argcnt], size); name[size - 1] = '\0'; if (--ap->argcnt == 0) { ap->freeglob = 0; globfree(&ap->glob); } # undef rawname } /* * Strip off the next token of the input. */ static char * copynext(char *input, char *output) { char *cp, *bp; char quote; for (cp = input; *cp == ' ' || *cp == '\t'; cp++) /* skip to argument */; bp = output; while (*cp != ' ' && *cp != '\t' && *cp != '\0') { /* * Handle back slashes. */ if (*cp == '\\') { if (*++cp == '\0') { fprintf(stderr, "command lines cannot be continued\n"); continue; } *bp++ = *cp++; continue; } /* * The usual unquoted case. */ if (*cp != '\'' && *cp != '"') { *bp++ = *cp++; continue; } /* * Handle single and double quotes. */ quote = *cp++; while (*cp != quote && *cp != '\0') *bp++ = *cp++; if (*cp++ == '\0') { fprintf(stderr, "missing %c\n", quote); cp--; continue; } } *bp = '\0'; return (cp); } /* * Canonicalize file names to always start with ``./'' and * remove any embedded "." and ".." components. 
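 * (canon(), below.)  First, a minimal standalone sketch -- not part of
 * the original source -- of the glob(3) redirection getcmd() relies on
 * above: with GLOB_ALTDIRFUNC, all directory reads and stats go through
 * caller-supplied hooks, which is how patterns are expanded against the
 * dump's directory image instead of the live file system.  The my_
 * callbacks are hypothetical stubs.
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <glob.h>
#include <stddef.h>

static void *
my_opendir(const char *name)
{
	return (NULL);			/* would open the dump image */
}

static struct dirent *
my_readdir(void *dirp)
{
	return (NULL);			/* would return the next entry */
}

static void
my_closedir(void *dirp)
{
}

static int
my_stat(const char *name, struct stat *stp)
{
	return (-1);			/* would fill stp from the dump */
}

static int
expand(const char *pattern, glob_t *g)
{
	g->gl_flags = GLOB_ALTDIRFUNC;
	g->gl_opendir = my_opendir;
	g->gl_readdir = my_readdir;
	g->gl_closedir = my_closedir;
	g->gl_lstat = my_stat;
	g->gl_stat = my_stat;
	/* caller must globfree(g) after a successful return */
	return (glob(pattern, GLOB_ALTDIRFUNC, NULL, g));
}

/*
 * Canonicalize file names to always start with ``./'' and
 * remove any embedded "." and ".." components.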
*/ void canon(char *rawname, char *canonname, size_t len) { char *cp, *np; if (strcmp(rawname, ".") == 0 || strncmp(rawname, "./", 2) == 0) (void) strcpy(canonname, ""); else if (rawname[0] == '/') (void) strcpy(canonname, "."); else (void) strcpy(canonname, "./"); if (strlen(canonname) + strlen(rawname) >= len) { fprintf(stderr, "canonname: not enough buffer space\n"); done(1); } (void) strcat(canonname, rawname); /* * Eliminate multiple and trailing '/'s */ for (cp = np = canonname; *np != '\0'; cp++) { *cp = *np++; while (*cp == '/' && *np == '/') np++; } *cp = '\0'; if (*--cp == '/') *cp = '\0'; /* * Eliminate extraneous "." and ".." from pathnames. */ for (np = canonname; *np != '\0'; ) { np++; cp = np; while (*np != '/' && *np != '\0') np++; if (np - cp == 1 && *cp == '.') { cp--; (void) strcpy(cp, np); np = cp; } if (np - cp == 2 && strncmp(cp, "..", 2) == 0) { cp--; while (cp > &canonname[1] && *--cp != '/') /* find beginning of name */; (void) strcpy(cp, np); np = cp; } } } /* * Do an "ls" style listing of a directory */ static void printlist(char *name, char *basename) { struct afile *fp, *list, *listp; struct direct *dp; struct afile single; RST_DIR *dirp; int entries, len, namelen; char locname[MAXPATHLEN]; dp = pathsearch(name); if (dp == NULL || (!dflag && TSTINO(dp->d_ino, dumpmap) == 0) || - (!vflag && dp->d_ino == WINO)) + (!vflag && dp->d_ino == UFS_WINO)) return; if ((dirp = rst_opendir(name)) == NULL) { entries = 1; list = &single; mkentry(name, dp, list); len = strlen(basename) + 1; if (strlen(name) - len > single.len) { freename(single.fname); single.fname = savename(&name[len]); single.len = strlen(single.fname); } } else { entries = 0; while ((dp = rst_readdir(dirp))) entries++; rst_closedir(dirp); list = (struct afile *)malloc(entries * sizeof(struct afile)); if (list == NULL) { fprintf(stderr, "ls: out of memory\n"); return; } if ((dirp = rst_opendir(name)) == NULL) panic("directory reopen failed\n"); fprintf(stderr, "%s:\n", name); entries = 0; listp = list; (void)strlcpy(locname, name, MAXPATHLEN); (void)strlcat(locname, "/", MAXPATHLEN); namelen = strlen(locname); while ((dp = rst_readdir(dirp))) { if (dp == NULL) break; if (!dflag && TSTINO(dp->d_ino, dumpmap) == 0) continue; - if (!vflag && (dp->d_ino == WINO || + if (!vflag && (dp->d_ino == UFS_WINO || strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0)) continue; locname[namelen] = '\0'; if (namelen + dp->d_namlen >= MAXPATHLEN) { fprintf(stderr, "%s%s: name exceeds %d char\n", locname, dp->d_name, MAXPATHLEN); } else { (void)strlcat(locname, dp->d_name, MAXPATHLEN); mkentry(locname, dp, listp++); entries++; } } rst_closedir(dirp); if (entries == 0) { fprintf(stderr, "\n"); free(list); return; } qsort((char *)list, entries, sizeof(struct afile), fcmp); } formatf(list, entries); if (dirp != NULL) { for (fp = listp - 1; fp >= list; fp--) freename(fp->fname); fprintf(stderr, "\n"); free(list); } } /* * Read the contents of a directory. 
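 * (mkentry(), below.)  First, a minimal standalone sketch -- not part
 * of the original source -- of the two-pass pattern printlist() uses
 * above: scan once to count, size the array, rescan to fill, then
 * qsort.  Here the rescannable source is a FILE of one name per line,
 * an illustrative stand-in for rst_opendir()/rst_readdir().
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
namecmp(const void *a, const void *b)
{
	return (strcoll(*(char *const *)a, *(char *const *)b));
}

static char **
collect(FILE *f, int *np)
{
	char line[256], **v;
	int n = 0, i = 0;

	while (fgets(line, sizeof(line), f) != NULL)	/* pass 1: count */
		n++;
	if ((v = malloc((n ? n : 1) * sizeof(*v))) == NULL)
		return (NULL);
	rewind(f);					/* pass 2: fill */
	while (i < n && fgets(line, sizeof(line), f) != NULL) {
		line[strcspn(line, "\n")] = '\0';
		v[i++] = strdup(line);
	}
	qsort(v, i, sizeof(*v), namecmp);
	*np = i;
	return (v);
}

/*
 * Read the contents of a directory.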
*/ static void mkentry(char *name, struct direct *dp, struct afile *fp) { char *cp; struct entry *np; fp->fnum = dp->d_ino; fp->fname = savename(dp->d_name); for (cp = fp->fname; *cp; cp++) if (!vflag && !isprint((unsigned char)*cp)) *cp = '?'; fp->len = cp - fp->fname; if (dflag && TSTINO(fp->fnum, dumpmap) == 0) fp->prefix = '^'; else if ((np = lookupname(name)) != NULL && (np->e_flags & NEW)) fp->prefix = '*'; else fp->prefix = ' '; switch(dp->d_type) { default: fprintf(stderr, "Warning: undefined file type %d\n", dp->d_type); /* FALLTHROUGH */ case DT_REG: fp->postfix = ' '; break; case DT_LNK: fp->postfix = '@'; break; case DT_FIFO: case DT_SOCK: fp->postfix = '='; break; case DT_CHR: case DT_BLK: fp->postfix = '#'; break; case DT_WHT: fp->postfix = '%'; break; case DT_UNKNOWN: case DT_DIR: if (inodetype(dp->d_ino) == NODE) fp->postfix = '/'; else fp->postfix = ' '; break; } return; } /* * Print out a pretty listing of a directory */ static void formatf(struct afile *list, int nentry) { struct afile *fp, *endlist; int width, bigino, haveprefix, havepostfix; int i, j, w, precision, columns, lines; width = 0; haveprefix = 0; havepostfix = 0; - bigino = ROOTINO; + bigino = UFS_ROOTINO; endlist = &list[nentry]; for (fp = &list[0]; fp < endlist; fp++) { if (bigino < fp->fnum) bigino = fp->fnum; if (width < fp->len) width = fp->len; if (fp->prefix != ' ') haveprefix = 1; if (fp->postfix != ' ') havepostfix = 1; } if (haveprefix) width++; if (havepostfix) width++; if (vflag) { for (precision = 0, i = bigino; i > 0; i /= 10) precision++; width += precision + 1; } width++; columns = 81 / width; if (columns == 0) columns = 1; lines = howmany(nentry, columns); for (i = 0; i < lines; i++) { for (j = 0; j < columns; j++) { fp = &list[j * lines + i]; if (vflag) { fprintf(stderr, "%*ju ", precision, (uintmax_t)fp->fnum); fp->len += precision + 1; } if (haveprefix) { putc(fp->prefix, stderr); fp->len++; } fprintf(stderr, "%s", fp->fname); if (havepostfix) { putc(fp->postfix, stderr); fp->len++; } if (fp + lines >= endlist) { fprintf(stderr, "\n"); break; } for (w = fp->len; w < width; w++) putc(' ', stderr); } } } /* * Skip over directory entries that are not on the tape * * First have to get definition of a dirent. */ #undef DIRBLKSIZ #include #undef d_ino struct dirent * glob_readdir(void *dirp) { struct direct *dp; static struct dirent adirent; while ((dp = rst_readdir(dirp)) != NULL) { - if (!vflag && dp->d_ino == WINO) + if (!vflag && dp->d_ino == UFS_WINO) continue; if (dflag || TSTINO(dp->d_ino, dumpmap)) break; } if (dp == NULL) return (NULL); adirent.d_fileno = dp->d_ino; adirent.d_namlen = dp->d_namlen; memmove(adirent.d_name, dp->d_name, dp->d_namlen + 1); return (&adirent); } /* * Return st_mode information in response to stat or lstat calls */ static int glob_stat(const char *name, struct stat *stp) { struct direct *dp; dp = pathsearch(name); if (dp == NULL || (!dflag && TSTINO(dp->d_ino, dumpmap) == 0) || - (!vflag && dp->d_ino == WINO)) + (!vflag && dp->d_ino == UFS_WINO)) return (-1); if (inodetype(dp->d_ino) == NODE) stp->st_mode = IFDIR; else stp->st_mode = IFREG; return (0); } /* * Comparison routine for qsort. 
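 * (fcmp(), below.)  First, a minimal standalone sketch -- not part of
 * the original source -- of the column-major layout formatf() computes
 * above: with `lines' rows, the item at row i, column j is
 * list[j * lines + i], so the sorted names read down each column the
 * way ls(1) prints them.
 */

#include <stdio.h>

static void
columnate(const char **name, int n, int width)
{
	int i, j, k, cols, lines;

	cols = 80 / (width + 1);
	if (cols == 0)
		cols = 1;
	lines = (n + cols - 1) / cols;		/* howmany(n, cols) */
	for (i = 0; i < lines; i++) {
		for (j = 0; j < cols; j++) {
			k = j * lines + i;
			if (k >= n)
				break;
			printf("%-*s", width + 1, name[k]);
		}
		putchar('\n');
	}
}

/*
 * Comparison routine for qsort.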
*/ static int fcmp(const void *f1, const void *f2) { return (strcoll(((struct afile *)f1)->fname, ((struct afile *)f2)->fname)); } /* * respond to interrupts */ void onintr(int signo __unused) { if (command == 'i' && runshell) longjmp(reset, 1); if (reply("restore interrupted, continue") == FAIL) done(1); } Index: head/sbin/restore/main.c =================================================================== --- head/sbin/restore/main.c (revision 313779) +++ head/sbin/restore/main.c (revision 313780) @@ -1,377 +1,377 @@ /* * Copyright (c) 1983, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1983, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)main.c 8.6 (Berkeley) 5/4/95"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "restore.h" #include "extern.h" int bflag = 0, cvtflag = 0, dflag = 0, Dflag = 0, vflag = 0, yflag = 0; int hflag = 1, mflag = 1, Nflag = 0; int uflag = 0; int pipecmd = 0; char command = '\0'; long dumpnum = 1; long volno = 0; long ntrec; char *dumpmap; char *usedinomap; ino_t maxino; time_t dumptime; time_t dumpdate; FILE *terminal; static void obsolete(int *, char **[]); static void usage(void) __dead2; int main(int argc, char *argv[]) { int ch; ino_t ino; char *inputdev; char *symtbl = "./restoresymtable"; char *p, name[MAXPATHLEN]; /* Temp files should *not* be readable. We set permissions later. */ (void) umask(077); if (argc < 2) usage(); (void)setlocale(LC_ALL, ""); inputdev = NULL; obsolete(&argc, &argv); while ((ch = getopt(argc, argv, "b:dDf:himNP:Rrs:tuvxy")) != -1) switch(ch) { case 'b': /* Change default tape blocksize. 
*/ bflag = 1; ntrec = strtol(optarg, &p, 10); if (*p) errx(1, "illegal blocksize -- %s", optarg); if (ntrec <= 0) errx(1, "block size must be greater than 0"); break; case 'd': dflag = 1; break; case 'D': Dflag = 1; break; case 'f': if (pipecmd) errx(1, "-P and -f options are mutually exclusive"); inputdev = optarg; break; case 'P': if (!pipecmd && inputdev) errx(1, "-P and -f options are mutually exclusive"); inputdev = optarg; pipecmd = 1; break; case 'h': hflag = 0; break; case 'i': case 'R': case 'r': case 't': case 'x': if (command != '\0') errx(1, "%c and %c options are mutually exclusive", ch, command); command = ch; break; case 'm': mflag = 0; break; case 'N': Nflag = 1; break; case 's': /* Dumpnum (skip to) for multifile dump tapes. */ dumpnum = strtol(optarg, &p, 10); if (*p) errx(1, "illegal dump number -- %s", optarg); if (dumpnum <= 0) errx(1, "dump number must be greater than 0"); break; case 'u': uflag = 1; break; case 'v': vflag = 1; break; case 'y': yflag = 1; break; default: usage(); } argc -= optind; argv += optind; if (command == '\0') errx(1, "none of i, R, r, t or x options specified"); if (signal(SIGINT, onintr) == SIG_IGN) (void) signal(SIGINT, SIG_IGN); if (signal(SIGTERM, onintr) == SIG_IGN) (void) signal(SIGTERM, SIG_IGN); setlinebuf(stderr); if (inputdev == NULL && (inputdev = getenv("TAPE")) == NULL) inputdev = _PATH_DEFTAPE; setinput(inputdev, pipecmd); if (argc == 0) { argc = 1; *--argv = "."; } switch (command) { /* * Interactive mode. */ case 'i': setup(); extractdirs(1); initsymtable(NULL); runcmdshell(); break; /* * Incremental restoration of a file system. */ case 'r': setup(); if (dumptime > 0) { /* * This is an incremental dump tape. */ vprintf(stdout, "Begin incremental restore\n"); initsymtable(symtbl); extractdirs(1); removeoldleaves(); vprintf(stdout, "Calculate node updates.\n"); - treescan(".", ROOTINO, nodeupdates); + treescan(".", UFS_ROOTINO, nodeupdates); findunreflinks(); removeoldnodes(); } else { /* * This is a level zero dump tape. */ vprintf(stdout, "Begin level 0 restore\n"); initsymtable((char *)0); extractdirs(1); vprintf(stdout, "Calculate extraction list.\n"); - treescan(".", ROOTINO, nodeupdates); + treescan(".", UFS_ROOTINO, nodeupdates); } createleaves(symtbl); createlinks(); setdirmodes(FORCE); checkrestore(); if (dflag) { vprintf(stdout, "Verify the directory structure\n"); - treescan(".", ROOTINO, verifyfile); + treescan(".", UFS_ROOTINO, verifyfile); } dumpsymtable(symtbl, (long)1); break; /* * Resume an incremental file system restoration. */ case 'R': initsymtable(symtbl); skipmaps(); skipdirs(); createleaves(symtbl); createlinks(); setdirmodes(FORCE); checkrestore(); dumpsymtable(symtbl, (long)1); break; /* * List contents of tape. */ case 't': setup(); extractdirs(0); initsymtable((char *)0); while (argc--) { canon(*argv++, name, sizeof(name)); ino = dirlookup(name); if (ino == 0) continue; treescan(name, ino, listfile); } break; /* * Batch extraction of tape contents. 
*/ case 'x': setup(); extractdirs(1); initsymtable((char *)0); while (argc--) { canon(*argv++, name, sizeof(name)); ino = dirlookup(name); if (ino == 0) continue; if (mflag) pathcheck(name); treescan(name, ino, addfile); } createfiles(); createlinks(); setdirmodes(0); if (dflag) checkrestore(); break; } done(0); /* NOTREACHED */ } static void usage() { const char *const common = "[-b blocksize] [-f file | -P pipecommand] [-s fileno]"; const char *const fileell = "[file ...]"; (void)fprintf(stderr, "usage:\t%s %s\n\t%s %s\n\t%s %s\n" "\t%s %s %s\n\t%s %s %s\n", "restore -i [-dhmNuvy]", common, "restore -R [-dNuvy]", common, "restore -r [-dNuvy]", common, "restore -t [-dhNuvy]", common, fileell, "restore -x [-dhmNuvy]", common, fileell); done(1); } /* * obsolete -- * Change set of key letters and ordered arguments into something * getopt(3) will like. */ static void obsolete(int *argcp, char **argvp[]) { int argc, flags; char *ap, **argv, *flagsp, **nargv, *p; /* Setup. */ argv = *argvp; argc = *argcp; /* Return if no arguments or first argument has leading dash. */ ap = argv[1]; if (argc == 1 || *ap == '-') return; /* Allocate space for new arguments. */ if ((*argvp = nargv = malloc((argc + 1) * sizeof(char *))) == NULL || (p = flagsp = malloc(strlen(ap) + 2)) == NULL) err(1, NULL); *nargv++ = *argv; argv += 2, argc -= 2; for (flags = 0; *ap; ++ap) { switch (*ap) { case 'b': case 'f': case 's': if (*argv == NULL) { warnx("option requires an argument -- %c", *ap); usage(); } if ((nargv[0] = malloc(strlen(*argv) + 2 + 1)) == NULL) err(1, NULL); nargv[0][0] = '-'; nargv[0][1] = *ap; (void)strcpy(&nargv[0][2], *argv); ++argv; ++nargv; break; default: if (!flags) { *p++ = '-'; flags = 1; } *p++ = *ap; break; } } /* Terminate flags. */ if (flags) { *p = '\0'; *nargv++ = flagsp; } else free(flagsp); /* Copy remaining arguments. */ while ((*nargv++ = *argv++)); /* Update argument count. */ *argcp = nargv - *argvp - 1; } Index: head/sbin/restore/restore.c =================================================================== --- head/sbin/restore/restore.c (revision 313779) +++ head/sbin/restore/restore.c (revision 313780) @@ -1,860 +1,860 @@ /* * Copyright (c) 1983, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)restore.c 8.3 (Berkeley) 9/13/94"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "restore.h" #include "extern.h" static char *keyval(int); /* * This implements the 't' option. * List entries on the tape. */ long listfile(char *name, ino_t ino, int type) { long descend = hflag ? GOOD : FAIL; if (TSTINO(ino, dumpmap) == 0) return (descend); vprintf(stdout, "%s", type == LEAF ? "leaf" : "dir "); fprintf(stdout, "%10ju\t%s\n", (uintmax_t)ino, name); return (descend); } /* * This implements the 'x' option. * Request that new entries be extracted. */ long addfile(char *name, ino_t ino, int type) { struct entry *ep; long descend = hflag ? GOOD : FAIL; char buf[100]; if (TSTINO(ino, dumpmap) == 0) { dprintf(stdout, "%s: not on the tape\n", name); return (descend); } - if (ino == WINO && command == 'i' && !vflag) + if (ino == UFS_WINO && command == 'i' && !vflag) return (descend); if (!mflag) { (void) sprintf(buf, "./%ju", (uintmax_t)ino); name = buf; if (type == NODE) { (void) genliteraldir(name, ino); return (descend); } } ep = lookupino(ino); if (ep != NULL) { if (strcmp(name, myname(ep)) == 0) { ep->e_flags |= NEW; return (descend); } type |= LINK; } ep = addentry(name, ino, type); if (type == NODE) newnode(ep); ep->e_flags |= NEW; return (descend); } /* * This is used by the 'i' option to undo previous requests made by addfile. * Delete entries from the request queue. */ /* ARGSUSED */ long deletefile(char *name, ino_t ino, int type) { long descend = hflag ? GOOD : FAIL; struct entry *ep; if (TSTINO(ino, dumpmap) == 0) return (descend); ep = lookupname(name); if (ep != NULL) { ep->e_flags &= ~NEW; ep->e_flags |= REMOVED; if (ep->e_type != NODE) freeentry(ep); } return (descend); } /* * The following four routines implement the incremental * restore algorithm. The first removes old entries, the second * does renames and calculates the extraction list, the third * cleans up link names missed by the first two, and the final * one deletes old directories. * * Directories cannot be immediately deleted, as they may have * other files in them which need to be moved out first. As * directories to be deleted are found, they are put on the * following deletion list. After all deletions and renames * are done, this list is actually deleted. */ static struct entry *removelist; /* * Remove invalid whiteouts from the old tree. * Remove unneeded leaves from the old tree. * Remove directories from the lookup chains. */ void removeoldleaves(void) { struct entry *ep, *nextep; ino_t i, mydirino; vprintf(stdout, "Mark entries to be removed.\n"); - if ((ep = lookupino(WINO))) { + if ((ep = lookupino(UFS_WINO))) { vprintf(stdout, "Delete whiteouts\n"); for ( ; ep != NULL; ep = nextep) { nextep = ep->e_links; mydirino = ep->e_parent->e_ino; /* * We remove all whiteouts that are in directories * that have been removed or that have been dumped. 
*/ if (TSTINO(mydirino, usedinomap) && !TSTINO(mydirino, dumpmap)) continue; delwhiteout(ep); freeentry(ep); } } - for (i = ROOTINO + 1; i < maxino; i++) { + for (i = UFS_ROOTINO + 1; i < maxino; i++) { ep = lookupino(i); if (ep == NULL) continue; if (TSTINO(i, usedinomap)) continue; for ( ; ep != NULL; ep = ep->e_links) { dprintf(stdout, "%s: REMOVE\n", myname(ep)); if (ep->e_type == LEAF) { removeleaf(ep); freeentry(ep); } else { mktempname(ep); deleteino(ep->e_ino); ep->e_next = removelist; removelist = ep; } } } } /* * For each directory entry on the incremental tape, determine which * category it falls into as follows: * KEEP - entries that are to be left alone. * NEW - new entries to be added. * EXTRACT - files that must be updated with new contents. * LINK - new links to be added. * Renames are done at the same time. */ long nodeupdates(char *name, ino_t ino, int type) { struct entry *ep, *np, *ip; long descend = GOOD; int lookuptype = 0; int key = 0; /* key values */ # define ONTAPE 0x1 /* inode is on the tape */ # define INOFND 0x2 /* inode already exists */ # define NAMEFND 0x4 /* name already exists */ # define MODECHG 0x8 /* mode of inode changed */ /* * This routine is called once for each element in the * directory hierarchy, with a full path name. * The "type" value is incorrectly specified as LEAF for * directories that are not on the dump tape. * * Check to see if the file is on the tape. */ if (TSTINO(ino, dumpmap)) key |= ONTAPE; /* * Check to see if the name exists, and if the name is a link. */ np = lookupname(name); if (np != NULL) { key |= NAMEFND; ip = lookupino(np->e_ino); if (ip == NULL) panic("corrupted symbol table\n"); if (ip != np) lookuptype = LINK; } /* * Check to see if the inode exists, and if one of its links * corresponds to the name (if one was found). */ ip = lookupino(ino); if (ip != NULL) { key |= INOFND; for (ep = ip->e_links; ep != NULL; ep = ep->e_links) { if (ep == np) { ip = ep; break; } } } /* * If both a name and an inode are found, but they do not * correspond to the same file, then both the inode that has * been found and the inode corresponding to the name that * has been found need to be renamed. The current pathname * is the new name for the inode that has been found. Since * all files to be deleted have already been removed, the * named file is either a now unneeded link, or it must live * under a new name in this dump level. If it is a link, it * can be removed. If it is not a link, it is given a * temporary name in anticipation that it will be renamed * when it is later found by inode number. */ if (((key & (INOFND|NAMEFND)) == (INOFND|NAMEFND)) && ip != np) { if (lookuptype == LINK) { removeleaf(np); freeentry(np); } else { dprintf(stdout, "name/inode conflict, mktempname %s\n", myname(np)); mktempname(np); } np = NULL; key &= ~NAMEFND; } if ((key & ONTAPE) && (((key & INOFND) && ip->e_type != type) || ((key & NAMEFND) && np->e_type != type))) key |= MODECHG; /* * Decide on the disposition of the file based on its flags. * Note that we have already handled the case in which * a name and inode are found that correspond to different files. * Thus if both NAMEFND and INOFND are set then ip == np. */ switch (key) { /* * A previously existing file has been found. * Mark it as KEEP so that other links to the inode can be * detected, and so that it will not be reclaimed by the search * for unreferenced names. 
*/ case INOFND|NAMEFND: ip->e_flags |= KEEP; dprintf(stdout, "[%s] %s: %s\n", keyval(key), name, flagvalues(ip)); break; /* * A file on the tape has a name which is the same as a name * corresponding to a different file in the previous dump. * Since all files to be deleted have already been removed, * this file is either a now unneeded link, or it must live * under a new name in this dump level. If it is a link, it * can simply be removed. If it is not a link, it is given a * temporary name in anticipation that it will be renamed * when it is later found by inode number (see INOFND case * below). The entry is then treated as a new file. */ case ONTAPE|NAMEFND: case ONTAPE|NAMEFND|MODECHG: if (lookuptype == LINK) { removeleaf(np); freeentry(np); } else { mktempname(np); } /* FALLTHROUGH */ /* * A previously non-existent file. * Add it to the file system, and request its extraction. * If it is a directory, create it immediately. * (Since the name is unused there can be no conflict) */ case ONTAPE: ep = addentry(name, ino, type); if (type == NODE) newnode(ep); ep->e_flags |= NEW|KEEP; dprintf(stdout, "[%s] %s: %s\n", keyval(key), name, flagvalues(ep)); break; /* * A file with the same inode number, but a different * name has been found. If the other name has not already * been found (indicated by the KEEP flag, see above) then * this must be a new name for the file, and it is renamed. * If the other name has been found then this must be a * link to the file. Hard links to directories are not * permitted, and are either deleted or converted to * symbolic links. Finally, if the file is on the tape, * a request is made to extract it. */ case ONTAPE|INOFND: if (type == LEAF && (ip->e_flags & KEEP) == 0) ip->e_flags |= EXTRACT; /* FALLTHROUGH */ case INOFND: if ((ip->e_flags & KEEP) == 0) { renameit(myname(ip), name); moveentry(ip, name); ip->e_flags |= KEEP; dprintf(stdout, "[%s] %s: %s\n", keyval(key), name, flagvalues(ip)); break; } if (ip->e_type == NODE) { descend = FAIL; fprintf(stderr, "deleted hard link %s to directory %s\n", name, myname(ip)); break; } ep = addentry(name, ino, type|LINK); ep->e_flags |= NEW; dprintf(stdout, "[%s] %s: %s|LINK\n", keyval(key), name, flagvalues(ep)); break; /* * A previously known file which is to be updated. If it is a link, * then all names referring to the previous file must be removed * so that the subset of them that remain can be recreated. */ case ONTAPE|INOFND|NAMEFND: if (lookuptype == LINK) { removeleaf(np); freeentry(np); ep = addentry(name, ino, type|LINK); if (type == NODE) newnode(ep); ep->e_flags |= NEW|KEEP; dprintf(stdout, "[%s] %s: %s|LINK\n", keyval(key), name, flagvalues(ep)); break; } if (type == LEAF && lookuptype != LINK) np->e_flags |= EXTRACT; np->e_flags |= KEEP; dprintf(stdout, "[%s] %s: %s\n", keyval(key), name, flagvalues(np)); break; /* * An inode is being reused in a completely different way. * Normally an extract can simply do an "unlink" followed * by a "creat". Here we must do effectively the same * thing. The complications arise because we cannot really * delete a directory since it may still contain files * that we need to rename, so we delete it from the symbol * table, and put it on the list to be deleted eventually. * Conversely if a directory is to be created, it must be * done immediately, rather than waiting until the * extraction phase. 
*/ case ONTAPE|INOFND|MODECHG: case ONTAPE|INOFND|NAMEFND|MODECHG: if (ip->e_flags & KEEP) { badentry(ip, "cannot KEEP and change modes"); break; } if (ip->e_type == LEAF) { /* changing from leaf to node */ for (ip = lookupino(ino); ip != NULL; ip = ip->e_links) { if (ip->e_type != LEAF) badentry(ip, "NODE and LEAF links to same inode"); removeleaf(ip); freeentry(ip); } ip = addentry(name, ino, type); newnode(ip); } else { /* changing from node to leaf */ if ((ip->e_flags & TMPNAME) == 0) mktempname(ip); deleteino(ip->e_ino); ip->e_next = removelist; removelist = ip; ip = addentry(name, ino, type); } ip->e_flags |= NEW|KEEP; dprintf(stdout, "[%s] %s: %s\n", keyval(key), name, flagvalues(ip)); break; /* * A hard link to a directory that has been removed. * Ignore it. */ case NAMEFND: dprintf(stdout, "[%s] %s: Extraneous name\n", keyval(key), name); descend = FAIL; break; /* * If we find a directory entry for a file that is not on * the tape, then we must have found a file that was created * while the dump was in progress. Since we have no contents * for it, we discard the name knowing that it will be on the * next incremental tape. */ case 0: fprintf(stderr, "%s: (inode %ju) not found on tape\n", name, (uintmax_t)ino); break; /* * If any of these arise, something is grievously wrong with * the current state of the symbol table. */ case INOFND|NAMEFND|MODECHG: case NAMEFND|MODECHG: case INOFND|MODECHG: fprintf(stderr, "[%s] %s: inconsistent state\n", keyval(key), name); break; /* * These states "cannot" arise for any state of the symbol table. */ case ONTAPE|MODECHG: case MODECHG: default: panic("[%s] %s: impossible state\n", keyval(key), name); break; } return (descend); } /* * Calculate the active flags in a key. */ static char * keyval(int key) { static char keybuf[32]; (void) strcpy(keybuf, "|NIL"); keybuf[0] = '\0'; if (key & ONTAPE) (void) strcat(keybuf, "|ONTAPE"); if (key & INOFND) (void) strcat(keybuf, "|INOFND"); if (key & NAMEFND) (void) strcat(keybuf, "|NAMEFND"); if (key & MODECHG) (void) strcat(keybuf, "|MODECHG"); return (&keybuf[1]); } /* * Find unreferenced link names. */ void findunreflinks(void) { struct entry *ep, *np; ino_t i; vprintf(stdout, "Find unreferenced names.\n"); - for (i = ROOTINO; i < maxino; i++) { + for (i = UFS_ROOTINO; i < maxino; i++) { ep = lookupino(i); if (ep == NULL || ep->e_type == LEAF || TSTINO(i, dumpmap) == 0) continue; for (np = ep->e_entries; np != NULL; np = np->e_sibling) { if (np->e_flags == 0) { dprintf(stdout, "%s: remove unreferenced name\n", myname(np)); removeleaf(np); freeentry(np); } } } /* * Any leaves remaining in removed directories is unreferenced. */ for (ep = removelist; ep != NULL; ep = ep->e_next) { for (np = ep->e_entries; np != NULL; np = np->e_sibling) { if (np->e_type == LEAF) { if (np->e_flags != 0) badentry(np, "unreferenced with flags"); dprintf(stdout, "%s: remove unreferenced name\n", myname(np)); removeleaf(np); freeentry(np); } } } } /* * Remove old nodes (directories). * Note that this routine runs in O(N*D) where: * N is the number of directory entries to be removed. * D is the maximum depth of the tree. * If N == D this can be quite slow. If the list were * topologically sorted, the deletion could be done in * time O(N). 
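 * A minimal standalone paraphrase -- not part of the original source --
 * of that repeated sweep; struct dnode and nentries are illustrative
 * stand-ins for struct entry and its e_entries list.  Each pass deletes
 * the directories that have become empty, so a chain of depth D can
 * take D passes over N entries; a deepest-first order would need one.
 */

#include <stddef.h>

struct dnode {
	struct dnode *next;	/* removal-list link */
	struct dnode *parent;
	int nentries;		/* live entries still inside */
};

static void
sweep(struct dnode **listp)
{
	struct dnode **prev, *p;
	int change;

	do {
		change = 0;
		for (prev = listp; (p = *prev) != NULL; ) {
			if (p->nentries != 0) {	/* occupied; next pass */
				prev = &p->next;
				continue;
			}
			*prev = p->next;	/* unlink and remove */
			if (p->parent != NULL)
				p->parent->nentries--;
			change = 1;
		}
	} while (change);
}

/*
 * Remove old nodes (directories).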
*/ void removeoldnodes(void) { struct entry *ep, **prev; long change; vprintf(stdout, "Remove old nodes (directories).\n"); do { change = 0; prev = &removelist; for (ep = removelist; ep != NULL; ep = *prev) { if (ep->e_entries != NULL) { prev = &ep->e_next; continue; } *prev = ep->e_next; removenode(ep); freeentry(ep); change++; } } while (change); for (ep = removelist; ep != NULL; ep = ep->e_next) badentry(ep, "cannot remove, non-empty"); } /* * This is the routine used to extract files for the 'r' command. * Extract new leaves. */ void createleaves(char *symtabfile) { struct entry *ep; ino_t first; long curvol; if (command == 'R') { vprintf(stdout, "Continue extraction of new leaves\n"); } else { vprintf(stdout, "Extract new leaves.\n"); dumpsymtable(symtabfile, volno); } - first = lowerbnd(ROOTINO); + first = lowerbnd(UFS_ROOTINO); curvol = volno; while (curfile.ino < maxino) { first = lowerbnd(first); /* * If the next available file is not the one which we * expect then we have missed one or more files. Since * we do not request files that were not on the tape, * the lost files must have been due to a tape read error, * or a file that was removed while the dump was in progress. */ while (first < curfile.ino) { ep = lookupino(first); if (ep == NULL) panic("%ju: bad first\n", (uintmax_t)first); fprintf(stderr, "%s: not found on tape\n", myname(ep)); ep->e_flags &= ~(NEW|EXTRACT); first = lowerbnd(first); } /* * If we find files on the tape that have no corresponding * directory entries, then we must have found a file that * was created while the dump was in progress. Since we have * no name for it, we discard it knowing that it will be * on the next incremental tape. */ if (first != curfile.ino) { fprintf(stderr, "expected next file %ju, got %ju\n", (uintmax_t)first, (uintmax_t)curfile.ino); skipfile(); goto next; } ep = lookupino(curfile.ino); if (ep == NULL) panic("unknown file on tape\n"); if ((ep->e_flags & (NEW|EXTRACT)) == 0) badentry(ep, "unexpected file on tape"); /* * If the file is to be extracted, then the old file must * be removed since its type may change from one leaf type * to another (e.g. "file" to "character special"). */ if ((ep->e_flags & EXTRACT) != 0) { removeleaf(ep); ep->e_flags &= ~REMOVED; } (void) extractfile(myname(ep)); ep->e_flags &= ~(NEW|EXTRACT); /* * We checkpoint the restore after every tape reel, so * as to simplify the amount of work required by the * 'R' command. */ next: if (curvol != volno) { dumpsymtable(symtabfile, volno); skipmaps(); curvol = volno; } } } /* * This is the routine used to extract files for the 'x' and 'i' commands. * Efficiently extract a subset of the files on a tape. */ void createfiles(void) { ino_t first, next, last; struct entry *ep; long curvol; vprintf(stdout, "Extract requested files\n"); curfile.action = SKIP; getvol((long)1); skipmaps(); skipdirs(); - first = lowerbnd(ROOTINO); + first = lowerbnd(UFS_ROOTINO); last = upperbnd(maxino - 1); for (;;) { curvol = volno; first = lowerbnd(first); last = upperbnd(last); /* * Check to see if any files remain to be extracted */ if (first > last) return; if (Dflag) { if (curfile.ino == maxino) return; if((ep = lookupino(curfile.ino)) != NULL && (ep->e_flags & (NEW|EXTRACT))) { goto justgetit; } else { skipfile(); continue; } } /* * Reject any volumes with inodes greater than the last * one needed, so that we can quickly skip backwards to * a volume containing useful inodes. 
We can't do this * if there are no further volumes available (curfile.ino * >= maxino) or if we are already at the first tape. */ if (curfile.ino > last && curfile.ino < maxino && volno > 1) { curfile.action = SKIP; getvol((long)0); skipmaps(); skipdirs(); continue; } /* * Decide on the next inode needed. * Skip across the inodes until it is found * or a volume change is encountered */ if (curfile.ino < maxino) { next = lowerbnd(curfile.ino); while (next > curfile.ino && volno == curvol) skipfile(); if (volno != curvol) { skipmaps(); skipdirs(); continue; } } else { /* * No further volumes or inodes available. Set * `next' to the first inode, so that a warning * is emitted below for each missing file. */ next = first; } /* * If the current inode is greater than the one we were * looking for then we missed the one we were looking for. * Since we only attempt to extract files listed in the * dump map, the lost files must have been due to a tape * read error, or a file that was removed while the dump * was in progress. Thus we report all requested files * between the one we were looking for, and the one we * found as missing, and delete their request flags. */ while (next < curfile.ino) { ep = lookupino(next); if (ep == NULL) panic("corrupted symbol table\n"); fprintf(stderr, "%s: not found on tape\n", myname(ep)); ep->e_flags &= ~NEW; next = lowerbnd(next); } /* * The current inode is the one that we are looking for, * so extract it per its requested name. */ if (next == curfile.ino && next <= last) { ep = lookupino(next); if (ep == NULL) panic("corrupted symbol table\n"); justgetit: (void) extractfile(myname(ep)); ep->e_flags &= ~NEW; if (volno != curvol) skipmaps(); } } } /* * Add links. */ void createlinks(void) { struct entry *np, *ep; ino_t i; char name[BUFSIZ]; - if ((ep = lookupino(WINO))) { + if ((ep = lookupino(UFS_WINO))) { vprintf(stdout, "Add whiteouts\n"); for ( ; ep != NULL; ep = ep->e_links) { if ((ep->e_flags & NEW) == 0) continue; (void) addwhiteout(myname(ep)); ep->e_flags &= ~NEW; } } vprintf(stdout, "Add links\n"); - for (i = ROOTINO; i < maxino; i++) { + for (i = UFS_ROOTINO; i < maxino; i++) { ep = lookupino(i); if (ep == NULL) continue; for (np = ep->e_links; np != NULL; np = np->e_links) { if ((np->e_flags & NEW) == 0) continue; (void) strcpy(name, myname(ep)); if (ep->e_type == NODE) { (void) linkit(name, myname(np), SYMLINK); } else { (void) linkit(name, myname(np), HARDLINK); } np->e_flags &= ~NEW; } } } /* * Check the symbol table. * We do this to insure that all the requested work was done, and * that no temporary names remain. */ void checkrestore(void) { struct entry *ep; ino_t i; vprintf(stdout, "Check the symbol table.\n"); - for (i = WINO; i < maxino; i++) { + for (i = UFS_WINO; i < maxino; i++) { for (ep = lookupino(i); ep != NULL; ep = ep->e_links) { ep->e_flags &= ~KEEP; if (ep->e_type == NODE) ep->e_flags &= ~(NEW|EXISTED); if (ep->e_flags != 0) badentry(ep, "incomplete operations"); } } } /* * Compare with the directory structure on the tape * A paranoid check that things are as they should be. 
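 * (verifyfile(), below.)  First, a minimal standalone sketch -- not
 * part of the original source -- of the shrinking [first, last] window
 * createfiles() maintains above.  lower()/upper() are simplified
 * stand-ins for this program's lowerbnd()/upperbnd(), snapping the
 * bounds to the nearest still-requested inode in a bitmap.
 */

#include <stddef.h>

#define MAXI	1024
static unsigned char want[MAXI];	/* 1 = still to be extracted */

static size_t
lower(size_t i)				/* first wanted inode >= i */
{
	while (i < MAXI && !want[i])
		i++;
	return (i);
}

static size_t
upper(size_t i)				/* last wanted inode <= i */
{
	while (i > 0 && !want[i])
		i--;
	return (i);
}

static void
extract_all(void)
{
	size_t first = 1, last = MAXI - 1;

	while ((first = lower(first)) <= (last = upper(last)) &&
	    first < MAXI) {
		/* extract inode `first', then drop it from the window */
		want[first] = 0;
	}
}

/*
 * Compare with the directory structure on the tape
 * A paranoid check that things are as they should be.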
*/ long verifyfile(char *name, ino_t ino, int type) { struct entry *np, *ep; long descend = GOOD; ep = lookupname(name); if (ep == NULL) { fprintf(stderr, "Warning: missing name %s\n", name); return (FAIL); } np = lookupino(ino); if (np != ep) descend = FAIL; for ( ; np != NULL; np = np->e_links) if (np == ep) break; if (np == NULL) panic("missing inumber %ju\n", (uintmax_t)ino); if (ep->e_type == LEAF && type != LEAF) badentry(ep, "type should be LEAF"); return (descend); } Index: head/sbin/restore/symtab.c =================================================================== --- head/sbin/restore/symtab.c (revision 313779) +++ head/sbin/restore/symtab.c (revision 313780) @@ -1,615 +1,615 @@ /* * Copyright (c) 1983, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)symtab.c 8.3 (Berkeley) 4/28/95"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ /* * These routines maintain the symbol table which tracks the state * of the file system being restored. They provide lookup by either * name or inode number. They also provide for creation, deletion, * and renaming of entries. Because of the dynamic nature of pathnames, * names should not be saved, but always constructed just before they * are needed, by calling "myname". */ #include #include #include #include #include #include #include #include #include #include #include #include "restore.h" #include "extern.h" /* * The following variables define the inode symbol table. * The primary hash table is dynamically allocated based on * the number of inodes in the file system (maxino), scaled by * HASHFACTOR. The variable "entry" points to the hash table; * the variable "entrytblsize" indicates its size (in entries). 
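 * A minimal standalone paraphrase -- not part of the original source --
 * of the on-demand name construction mentioned above (myname(), below):
 * walk parent links, writing each component in front of the previous
 * one from the tail of the buffer.  struct ent is illustrative; the
 * root is marked here by being its own parent.
 */

#include <stddef.h>
#include <string.h>

struct ent {
	struct ent *parent;
	const char *name;
};

static char *
pathof(struct ent *e, char *buf, size_t len)
{
	char *cp = buf + len - 1;
	size_t n;

	*cp = '\0';
	for (;;) {
		n = strlen(e->name);
		if ((size_t)(cp - buf) < n + 1)
			return (NULL);		/* buffer too small */
		cp -= n;
		memcpy(cp, e->name, n);
		if (e->parent == e)		/* reached the root */
			return (cp);
		*--cp = '/';
		e = e->parent;
	}
}

/*
 * The inode symbol table.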
*/ #define HASHFACTOR 5 static struct entry **entry; static long entrytblsize; static void addino(ino_t, struct entry *); static struct entry *lookupparent(char *); static void removeentry(struct entry *); /* * Look up an entry by inode number */ struct entry * lookupino(ino_t inum) { struct entry *ep; - if (inum < WINO || inum >= maxino) + if (inum < UFS_WINO || inum >= maxino) return (NULL); for (ep = entry[inum % entrytblsize]; ep != NULL; ep = ep->e_next) if (ep->e_ino == inum) return (ep); return (NULL); } /* * Add an entry into the entry table */ static void addino(ino_t inum, struct entry *np) { struct entry **epp; - if (inum < WINO || inum >= maxino) + if (inum < UFS_WINO || inum >= maxino) panic("addino: out of range %ju\n", (uintmax_t)inum); epp = &entry[inum % entrytblsize]; np->e_ino = inum; np->e_next = *epp; *epp = np; if (dflag) for (np = np->e_next; np != NULL; np = np->e_next) if (np->e_ino == inum) badentry(np, "duplicate inum"); } /* * Delete an entry from the entry table */ void deleteino(ino_t inum) { struct entry *next; struct entry **prev; - if (inum < WINO || inum >= maxino) + if (inum < UFS_WINO || inum >= maxino) panic("deleteino: out of range %ju\n", (uintmax_t)inum); prev = &entry[inum % entrytblsize]; for (next = *prev; next != NULL; next = next->e_next) { if (next->e_ino == inum) { next->e_ino = 0; *prev = next->e_next; return; } prev = &next->e_next; } panic("deleteino: %ju not found\n", (uintmax_t)inum); } /* * Look up an entry by name */ struct entry * lookupname(char *name) { struct entry *ep; char *np, *cp; char buf[MAXPATHLEN]; cp = name; - for (ep = lookupino(ROOTINO); ep != NULL; ep = ep->e_entries) { + for (ep = lookupino(UFS_ROOTINO); ep != NULL; ep = ep->e_entries) { for (np = buf; *cp != '/' && *cp != '\0' && np < &buf[sizeof(buf)]; ) *np++ = *cp++; if (np == &buf[sizeof(buf)]) break; *np = '\0'; for ( ; ep != NULL; ep = ep->e_sibling) if (strcmp(ep->e_name, buf) == 0) break; if (ep == NULL) break; if (*cp++ == '\0') return (ep); } return (NULL); } /* * Look up the parent of a pathname */ static struct entry * lookupparent(char *name) { struct entry *ep; char *tailindex; tailindex = strrchr(name, '/'); if (tailindex == NULL) return (NULL); *tailindex = '\0'; ep = lookupname(name); *tailindex = '/'; if (ep == NULL) return (NULL); if (ep->e_type != NODE) panic("%s is not a directory\n", name); return (ep); } /* * Determine the current pathname of a node or leaf */ char * myname(struct entry *ep) { char *cp; static char namebuf[MAXPATHLEN]; for (cp = &namebuf[MAXPATHLEN - 2]; cp > &namebuf[ep->e_namlen]; ) { cp -= ep->e_namlen; memmove(cp, ep->e_name, (long)ep->e_namlen); - if (ep == lookupino(ROOTINO)) + if (ep == lookupino(UFS_ROOTINO)) return (cp); *(--cp) = '/'; ep = ep->e_parent; } panic("%s: pathname too long\n", cp); return(cp); } /* * Unused symbol table entries are linked together on a free list * headed by the following pointer. 
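 * (addentry() pops it and freeentry() pushes it, below.)  A minimal
 * standalone sketch -- not part of the original source -- of the idiom:
 * a retired entry's own link field chains it for reuse, so the pool
 * only grows and malloc() is consulted only when the list is empty.
 * struct fent is illustrative.
 */

#include <stdlib.h>
#include <string.h>

struct fent {
	struct fent *next;	/* doubles as the free-list link */
	int type;		/* ... remaining payload ... */
};

static struct fent *freehead;

static struct fent *
fent_alloc(void)
{
	struct fent *p;

	if (freehead != NULL) {		/* reuse a retired entry */
		p = freehead;
		freehead = p->next;
		memset(p, 0, sizeof(*p));
	} else if ((p = calloc(1, sizeof(*p))) == NULL)
		return (NULL);
	return (p);
}

static void
fent_free(struct fent *p)
{
	p->next = freehead;		/* push; memory is never returned */
	freehead = p;
}

/*
 * head of the free list.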
*/ static struct entry *freelist = NULL; /* * add an entry to the symbol table */ struct entry * addentry(char *name, ino_t inum, int type) { struct entry *np, *ep; if (freelist != NULL) { np = freelist; freelist = np->e_next; memset(np, 0, (long)sizeof(struct entry)); } else { np = (struct entry *)calloc(1, sizeof(struct entry)); if (np == NULL) panic("no memory to extend symbol table\n"); } np->e_type = type & ~LINK; ep = lookupparent(name); if (ep == NULL) { - if (inum != ROOTINO || lookupino(ROOTINO) != NULL) + if (inum != UFS_ROOTINO || lookupino(UFS_ROOTINO) != NULL) panic("bad name to addentry %s\n", name); np->e_name = savename(name); np->e_namlen = strlen(name); np->e_parent = np; - addino(ROOTINO, np); + addino(UFS_ROOTINO, np); return (np); } np->e_name = savename(strrchr(name, '/') + 1); np->e_namlen = strlen(np->e_name); np->e_parent = ep; np->e_sibling = ep->e_entries; ep->e_entries = np; if (type & LINK) { ep = lookupino(inum); if (ep == NULL) panic("link to non-existent name\n"); np->e_ino = inum; np->e_links = ep->e_links; ep->e_links = np; } else if (inum != 0) { if (lookupino(inum) != NULL) panic("duplicate entry\n"); addino(inum, np); } return (np); } /* * delete an entry from the symbol table */ void freeentry(struct entry *ep) { struct entry *np; ino_t inum; if (ep->e_flags != REMOVED) badentry(ep, "not marked REMOVED"); if (ep->e_type == NODE) { if (ep->e_links != NULL) badentry(ep, "freeing referenced directory"); if (ep->e_entries != NULL) badentry(ep, "freeing non-empty directory"); } if (ep->e_ino != 0) { np = lookupino(ep->e_ino); if (np == NULL) badentry(ep, "lookupino failed"); if (np == ep) { inum = ep->e_ino; deleteino(inum); if (ep->e_links != NULL) addino(inum, ep->e_links); } else { for (; np != NULL; np = np->e_links) { if (np->e_links == ep) { np->e_links = ep->e_links; break; } } if (np == NULL) badentry(ep, "link not found"); } } removeentry(ep); freename(ep->e_name); ep->e_next = freelist; freelist = ep; } /* * Relocate an entry in the tree structure */ void moveentry(struct entry *ep, char *newname) { struct entry *np; char *cp; np = lookupparent(newname); if (np == NULL) badentry(ep, "cannot move ROOT"); if (np != ep->e_parent) { removeentry(ep); ep->e_parent = np; ep->e_sibling = np->e_entries; np->e_entries = ep; } cp = strrchr(newname, '/') + 1; freename(ep->e_name); ep->e_name = savename(cp); ep->e_namlen = strlen(cp); if (strcmp(gentempname(ep), ep->e_name) == 0) ep->e_flags |= TMPNAME; else ep->e_flags &= ~TMPNAME; } /* * Remove an entry in the tree structure */ static void removeentry(struct entry *ep) { struct entry *np; np = ep->e_parent; if (np->e_entries == ep) { np->e_entries = ep->e_sibling; } else { for (np = np->e_entries; np != NULL; np = np->e_sibling) { if (np->e_sibling == ep) { np->e_sibling = ep->e_sibling; break; } } if (np == NULL) badentry(ep, "cannot find entry in parent list"); } } /* * Table of unused string entries, sorted by length. * * Entries are allocated in STRTBLINCR sized pieces so that names * of similar lengths can use the same entry. The value of STRTBLINCR * is chosen so that every entry has at least enough space to hold * a "struct strtbl" header. Thus every entry can be linked onto an * appropriate free list. * * NB. The macro "allocsize" below assumes that "struct strhdr" * has a size that is a power of two. 
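 *
 * Worked example, assuming an 8-byte struct strhdr: allocsize() maps
 * name lengths 0-7 to 8-byte entries, lengths 8-15 to 16-byte entries,
 * and so on.  A freed name of length 10 is therefore linked onto
 * strtblhdr[10 / STRTBLINCR], that is strtblhdr[1], where it can
 * satisfy any later request for a name of length 8 through 15 without
 * another malloc().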
*/ struct strhdr { struct strhdr *next; }; #define STRTBLINCR (sizeof(struct strhdr)) #define allocsize(size) roundup2((size) + 1, STRTBLINCR) static struct strhdr strtblhdr[allocsize(NAME_MAX) / STRTBLINCR]; /* * Allocate space for a name. It first looks to see if it already * has an appropriate sized entry, and if not allocates a new one. */ char * savename(char *name) { struct strhdr *np; size_t len; char *cp; if (name == NULL) panic("bad name\n"); len = strlen(name); np = strtblhdr[len / STRTBLINCR].next; if (np != NULL) { strtblhdr[len / STRTBLINCR].next = np->next; cp = (char *)np; } else { cp = malloc(allocsize(len)); if (cp == NULL) panic("no space for string table\n"); } (void) strcpy(cp, name); return (cp); } /* * Free space for a name. The resulting entry is linked onto the * appropriate free list. */ void freename(char *name) { struct strhdr *tp, *np; tp = &strtblhdr[strlen(name) / STRTBLINCR]; np = (struct strhdr *)name; np->next = tp->next; tp->next = np; } /* * Useful quantities placed at the end of a dumped symbol table. */ struct symtableheader { int32_t volno; int32_t stringsize; int32_t entrytblsize; time_t dumptime; time_t dumpdate; ino_t maxino; int32_t ntrec; }; /* * dump a snapshot of the symbol table */ void dumpsymtable(char *filename, long checkpt) { struct entry *ep, *tep; ino_t i; struct entry temp, *tentry; long mynum = 1, stroff = 0; FILE *fd; struct symtableheader hdr; vprintf(stdout, "Checkpointing the restore\n"); if (Nflag) return; if ((fd = fopen(filename, "w")) == NULL) { fprintf(stderr, "fopen: %s\n", strerror(errno)); panic("cannot create save file %s for symbol table\n", filename); done(1); } clearerr(fd); /* * Assign indices to each entry * Write out the string entries */ - for (i = WINO; i <= maxino; i++) { + for (i = UFS_WINO; i <= maxino; i++) { for (ep = lookupino(i); ep != NULL; ep = ep->e_links) { ep->e_index = mynum++; (void) fwrite(ep->e_name, sizeof(char), (int)allocsize(ep->e_namlen), fd); } } /* * Convert pointers to indexes, and output */ tep = &temp; stroff = 0; - for (i = WINO; i <= maxino; i++) { + for (i = UFS_WINO; i <= maxino; i++) { for (ep = lookupino(i); ep != NULL; ep = ep->e_links) { memmove(tep, ep, (long)sizeof(struct entry)); tep->e_name = (char *)stroff; stroff += allocsize(ep->e_namlen); tep->e_parent = (struct entry *)ep->e_parent->e_index; if (ep->e_links != NULL) tep->e_links = (struct entry *)ep->e_links->e_index; if (ep->e_sibling != NULL) tep->e_sibling = (struct entry *)ep->e_sibling->e_index; if (ep->e_entries != NULL) tep->e_entries = (struct entry *)ep->e_entries->e_index; if (ep->e_next != NULL) tep->e_next = (struct entry *)ep->e_next->e_index; (void) fwrite((char *)tep, sizeof(struct entry), 1, fd); } } /* * Convert entry pointers to indexes, and output */ for (i = 0; i < entrytblsize; i++) { if (entry[i] == NULL) tentry = NULL; else tentry = (struct entry *)entry[i]->e_index; (void) fwrite((char *)&tentry, sizeof(struct entry *), 1, fd); } hdr.volno = checkpt; hdr.maxino = maxino; hdr.entrytblsize = entrytblsize; hdr.stringsize = stroff; hdr.dumptime = dumptime; hdr.dumpdate = dumpdate; hdr.ntrec = ntrec; (void) fwrite((char *)&hdr, sizeof(struct symtableheader), 1, fd); if (ferror(fd)) { fprintf(stderr, "fwrite: %s\n", strerror(errno)); panic("output error to file %s writing symbol table\n", filename); } (void) fclose(fd); } /* * Initialize a symbol table from a file */ void initsymtable(char *filename) { char *base; long tblsize; struct entry *ep; struct entry *baseep, *lep; struct symtableheader hdr; 
struct stat stbuf; long i; int fd; vprintf(stdout, "Initialize symbol table.\n"); if (filename == NULL) { entrytblsize = maxino / HASHFACTOR; entry = calloc((unsigned)entrytblsize, sizeof(struct entry *)); if (entry == NULL) panic("no memory for entry table\n"); - ep = addentry(".", ROOTINO, NODE); + ep = addentry(".", UFS_ROOTINO, NODE); ep->e_flags |= NEW; return; } if ((fd = open(filename, O_RDONLY, 0)) < 0) { fprintf(stderr, "open: %s\n", strerror(errno)); panic("cannot open symbol table file %s\n", filename); } if (fstat(fd, &stbuf) < 0) { fprintf(stderr, "stat: %s\n", strerror(errno)); panic("cannot stat symbol table file %s\n", filename); } tblsize = stbuf.st_size - sizeof(struct symtableheader); base = calloc(sizeof(char), (unsigned)tblsize); if (base == NULL) panic("cannot allocate space for symbol table\n"); if (read(fd, base, (int)tblsize) < 0 || read(fd, (char *)&hdr, sizeof(struct symtableheader)) < 0) { fprintf(stderr, "read: %s\n", strerror(errno)); panic("cannot read symbol table file %s\n", filename); } (void)close(fd); switch (command) { case 'r': /* * For normal continuation, insure that we are using * the next incremental tape */ if (hdr.dumpdate != dumptime) { if (hdr.dumpdate < dumptime) fprintf(stderr, "Incremental tape too low\n"); else fprintf(stderr, "Incremental tape too high\n"); done(1); } break; case 'R': /* * For restart, insure that we are using the same tape */ curfile.action = SKIP; dumptime = hdr.dumptime; dumpdate = hdr.dumpdate; if (!bflag) newtapebuf(hdr.ntrec); getvol(hdr.volno); break; default: panic("initsymtable called from command %c\n", command); break; } maxino = hdr.maxino; entrytblsize = hdr.entrytblsize; entry = (struct entry **) (base + tblsize - (entrytblsize * sizeof(struct entry *))); baseep = (struct entry *)(base + hdr.stringsize - sizeof(struct entry)); lep = (struct entry *)entry; for (i = 0; i < entrytblsize; i++) { if (entry[i] == NULL) continue; entry[i] = &baseep[(long)entry[i]]; } for (ep = &baseep[1]; ep < lep; ep++) { ep->e_name = base + (long)ep->e_name; ep->e_parent = &baseep[(long)ep->e_parent]; if (ep->e_sibling != NULL) ep->e_sibling = &baseep[(long)ep->e_sibling]; if (ep->e_links != NULL) ep->e_links = &baseep[(long)ep->e_links]; if (ep->e_entries != NULL) ep->e_entries = &baseep[(long)ep->e_entries]; if (ep->e_next != NULL) ep->e_next = &baseep[(long)ep->e_next]; } } Index: head/sbin/restore/tape.c =================================================================== --- head/sbin/restore/tape.c (revision 313779) +++ head/sbin/restore/tape.c (revision 313780) @@ -1,1700 +1,1700 @@ /* * Copyright (c) 1983, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)tape.c 8.9 (Berkeley) 5/1/95"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "restore.h" #include "extern.h" static long fssize = MAXBSIZE; static int mt = -1; static int pipein = 0; static int pipecmdin = 0; static FILE *popenfp = NULL; static char *magtape; static int blkcnt; static int numtrec; static char *tapebuf; static union u_spcl endoftapemark; static long byteslide = 0; static long blksread; /* blocks read since last header */ static int64_t tapeaddr = 0; /* current TP_BSIZE tape record */ static long tapesread; static jmp_buf restart; static int gettingfile = 0; /* restart has a valid frame */ static char *host = NULL; static int readmapflag; static int ofile; static char *map; static char lnkbuf[MAXPATHLEN + 1]; static int pathlen; int Bcvt; /* Swap Bytes */ int oldinofmt; /* FreeBSD 1 inode format needs cvt */ #define FLUSHTAPEBUF() blkcnt = ntrec + 1 char *namespace_names[] = EXTATTR_NAMESPACE_NAMES; static void accthdr(struct s_spcl *); static int checksum(int *); static void findinode(struct s_spcl *); static void findtapeblksize(void); static char *setupextattr(int); static void xtrattr(char *, size_t); static void skiphole(void (*)(char *, size_t), size_t *); static int gethead(struct s_spcl *); static void readtape(char *); static void setdumpnum(void); static u_long swabl(u_long); static u_char *swablong(u_char *, int); static u_char *swabshort(u_char *, int); static void terminateinput(void); static void xtrfile(char *, size_t); static void xtrlnkfile(char *, size_t); static void xtrlnkskip(char *, size_t); static void xtrmap(char *, size_t); static void xtrmapskip(char *, size_t); static void xtrskip(char *, size_t); /* * Set up an input source */ void setinput(char *source, int ispipecommand) { FLUSHTAPEBUF(); if (bflag) newtapebuf(ntrec); else newtapebuf(MAX(NTREC, HIGHDENSITYTREC)); terminal = stdin; if (ispipecommand) pipecmdin++; else #ifdef RRESTORE if (strchr(source, ':')) { host = source; source = strchr(host, ':'); *source++ = '\0'; if (rmthost(host) == 0) done(1); } else #endif if (strcmp(source, "-") == 0) { /* * Since input is coming from a pipe we must establish * our own connection to the terminal. 
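 * (With "-" as the source the dump stream itself occupies stdin, so
 * operator prompts must read their answers from the controlling
 * terminal instead; if that cannot be opened, /dev/null is substituted
 * so every subsequent prompt simply reads EOF.)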
*/ terminal = fopen(_PATH_TTY, "r"); if (terminal == NULL) { (void)fprintf(stderr, "cannot open %s: %s\n", _PATH_TTY, strerror(errno)); terminal = fopen(_PATH_DEVNULL, "r"); if (terminal == NULL) { (void)fprintf(stderr, "cannot open %s: %s\n", _PATH_DEVNULL, strerror(errno)); done(1); } } pipein++; } /* no longer need or want root privileges */ if (setuid(getuid()) != 0) { fprintf(stderr, "setuid failed\n"); done(1); } magtape = strdup(source); if (magtape == NULL) { fprintf(stderr, "Cannot allocate space for magtape buffer\n"); done(1); } } void newtapebuf(long size) { static int tapebufsize = -1; ntrec = size; if (size <= tapebufsize) return; if (tapebuf != NULL) free(tapebuf - TP_BSIZE); tapebuf = malloc((size+1) * TP_BSIZE); if (tapebuf == NULL) { fprintf(stderr, "Cannot allocate space for tape buffer\n"); done(1); } tapebuf += TP_BSIZE; tapebufsize = size; } /* * Verify that the tape drive can be accessed and * that it actually is a dump tape. */ void setup(void) { int i, j, *ip; struct stat stbuf; vprintf(stdout, "Verify tape and initialize maps\n"); if (pipecmdin) { if (setenv("RESTORE_VOLUME", "1", 1) == -1) { fprintf(stderr, "Cannot set $RESTORE_VOLUME: %s\n", strerror(errno)); done(1); } popenfp = popen(magtape, "r"); mt = popenfp ? fileno(popenfp) : -1; } else #ifdef RRESTORE if (host) mt = rmtopen(magtape, 0); else #endif if (pipein) mt = 0; else mt = open(magtape, O_RDONLY, 0); if (mt < 0) { fprintf(stderr, "%s: %s\n", magtape, strerror(errno)); done(1); } volno = 1; setdumpnum(); FLUSHTAPEBUF(); if (!pipein && !pipecmdin && !bflag) findtapeblksize(); if (gethead(&spcl) == FAIL) { fprintf(stderr, "Tape is not a dump tape\n"); done(1); } if (pipein) { endoftapemark.s_spcl.c_magic = FS_UFS2_MAGIC; endoftapemark.s_spcl.c_type = TS_END; ip = (int *)&endoftapemark; j = sizeof(union u_spcl) / sizeof(int); i = 0; do i += *ip++; while (--j); endoftapemark.s_spcl.c_checksum = CHECKSUM - i; } if (vflag || command == 't') printdumpinfo(); dumptime = _time64_to_time(spcl.c_ddate); dumpdate = _time64_to_time(spcl.c_date); if (stat(".", &stbuf) < 0) { fprintf(stderr, "cannot stat .: %s\n", strerror(errno)); done(1); } if (stbuf.st_blksize > 0 && stbuf.st_blksize < TP_BSIZE ) fssize = TP_BSIZE; if (stbuf.st_blksize >= TP_BSIZE && stbuf.st_blksize <= MAXBSIZE) fssize = stbuf.st_blksize; if (((TP_BSIZE - 1) & stbuf.st_blksize) != 0) { fprintf(stderr, "Warning: filesystem with non-multiple-of-%d " "blocksize (%d);\n", TP_BSIZE, stbuf.st_blksize); fssize = roundup(fssize, TP_BSIZE); fprintf(stderr, "\twriting using blocksize %ld\n", fssize); } if (spcl.c_volume != 1) { fprintf(stderr, "Tape is not volume 1 of the dump\n"); done(1); } if (gethead(&spcl) == FAIL) { dprintf(stdout, "header read failed at %ld blocks\n", blksread); panic("no header after volume mark!\n"); } findinode(&spcl); if (spcl.c_type != TS_CLRI) { fprintf(stderr, "Cannot find file removal list\n"); done(1); } maxino = (spcl.c_count * TP_BSIZE * NBBY) + 1; dprintf(stdout, "maxino = %ju\n", (uintmax_t)maxino); map = calloc((unsigned)1, (unsigned)howmany(maxino, NBBY)); if (map == NULL) panic("no memory for active inode map\n"); usedinomap = map; curfile.action = USING; getfile(xtrmap, xtrmapskip, xtrmapskip); if (spcl.c_type != TS_BITS) { fprintf(stderr, "Cannot find file dump list\n"); done(1); } map = calloc((unsigned)1, (unsigned)howmany(maxino, NBBY)); if (map == (char *)NULL) panic("no memory for file dump list\n"); dumpmap = map; curfile.action = USING; getfile(xtrmap, xtrmapskip, xtrmapskip); /* * If there may be whiteout 
entries on the tape, pretend that the * whiteout inode exists, so that the whiteout entries can be * extracted. */ - SETINO(WINO, dumpmap); + SETINO(UFS_WINO, dumpmap); /* 'r' restores don't call getvol() for tape 1, so mark it as read. */ if (command == 'r') tapesread = 1; } /* * Prompt user to load a new dump volume. * "Nextvol" is the next suggested volume to use. * This suggested volume is enforced when doing full * or incremental restores, but can be overridden by * the user when only extracting a subset of the files. */ void getvol(long nextvol) { int64_t prevtapea; long i, newvol, savecnt; union u_spcl tmpspcl; # define tmpbuf tmpspcl.s_spcl char buf[TP_BSIZE]; if (nextvol == 1) { tapesread = 0; gettingfile = 0; } prevtapea = tapeaddr; savecnt = blksread; if (pipein) { if (nextvol != 1) { panic("Changing volumes on pipe input?\n"); /* Avoid looping if we couldn't ask the user. */ if (yflag || ferror(terminal) || feof(terminal)) done(1); } if (volno == 1) return; newvol = 0; goto gethdr; } again: if (pipein) done(1); /* pipes do not get a second chance */ if (command == 'R' || command == 'r' || curfile.action != SKIP) newvol = nextvol; else newvol = 0; while (newvol <= 0) { if (tapesread == 0) { fprintf(stderr, "%s%s%s%s%s%s%s", "You have not read any tapes yet.\n", "If you are extracting just a few files,", " start with the last volume\n", "and work towards the first; restore", " can quickly skip tapes that\n", "have no further files to extract.", " Otherwise, begin with volume 1.\n"); } else { fprintf(stderr, "You have read volumes"); strcpy(buf, ": "); for (i = 0; i < 32; i++) if (tapesread & (1 << i)) { fprintf(stderr, "%s%ld", buf, i + 1); strcpy(buf, ", "); } fprintf(stderr, "\n"); } do { fprintf(stderr, "Specify next volume #: "); (void) fflush(stderr); if (fgets(buf, BUFSIZ, terminal) == NULL) done(1); } while (buf[0] == '\n'); newvol = atoi(buf); if (newvol <= 0) { fprintf(stderr, "Volume numbers are positive numerics\n"); } } if (newvol == volno) { tapesread |= 1 << (volno - 1); return; } closemt(); fprintf(stderr, "Mount tape volume %ld\n", newvol); fprintf(stderr, "Enter ``none'' if there are no more tapes\n"); fprintf(stderr, "otherwise enter tape name (default: %s) ", magtape); (void) fflush(stderr); if (fgets(buf, BUFSIZ, terminal) == NULL) done(1); if (!strcmp(buf, "none\n")) { terminateinput(); return; } if (buf[0] != '\n') { (void) strcpy(magtape, buf); magtape[strlen(magtape) - 1] = '\0'; } if (pipecmdin) { char volno[sizeof("2147483647")]; (void)sprintf(volno, "%ld", newvol); if (setenv("RESTORE_VOLUME", volno, 1) == -1) { fprintf(stderr, "Cannot set $RESTORE_VOLUME: %s\n", strerror(errno)); done(1); } popenfp = popen(magtape, "r"); mt = popenfp ? 
fileno(popenfp) : -1; } else #ifdef RRESTORE if (host) mt = rmtopen(magtape, 0); else #endif mt = open(magtape, O_RDONLY, 0); if (mt == -1) { fprintf(stderr, "Cannot open %s\n", magtape); volno = -1; goto again; } gethdr: volno = newvol; setdumpnum(); FLUSHTAPEBUF(); if (gethead(&tmpbuf) == FAIL) { dprintf(stdout, "header read failed at %ld blocks\n", blksread); fprintf(stderr, "tape is not dump tape\n"); volno = 0; goto again; } if (tmpbuf.c_volume != volno) { fprintf(stderr, "Wrong volume (%jd)\n", (intmax_t)tmpbuf.c_volume); volno = 0; goto again; } if (_time64_to_time(tmpbuf.c_date) != dumpdate || _time64_to_time(tmpbuf.c_ddate) != dumptime) { time_t t = _time64_to_time(tmpbuf.c_date); fprintf(stderr, "Wrong dump date\n\tgot: %s", ctime(&t)); fprintf(stderr, "\twanted: %s", ctime(&dumpdate)); volno = 0; goto again; } tapesread |= 1 << (volno - 1); blksread = savecnt; /* * If continuing from the previous volume, skip over any * blocks read already at the end of the previous volume. * * If coming to this volume at random, skip to the beginning * of the next record. */ dprintf(stdout, "last rec %jd, tape starts with %jd\n", (intmax_t)prevtapea, (intmax_t)tmpbuf.c_tapea); if (tmpbuf.c_type == TS_TAPE) { if (curfile.action != USING) { /* * XXX Dump incorrectly sets c_count to 1 in the * volume header of the first tape, so ignore * c_count when volno == 1. */ if (volno != 1) for (i = tmpbuf.c_count; i > 0; i--) readtape(buf); } else if (tmpbuf.c_tapea <= prevtapea) { /* * Normally the value of c_tapea in the volume * header is the record number of the header itself. * However in the volume header following an EOT- * terminated tape, it is the record number of the * first continuation data block (dump bug?). * * The next record we want is `prevtapea + 1'. */ i = prevtapea + 1 - tmpbuf.c_tapea; dprintf(stderr, "Skipping %ld duplicate record%s.\n", i, i > 1 ? "s" : ""); while (--i >= 0) readtape(buf); } } if (curfile.action == USING) { if (volno == 1) panic("active file into volume 1\n"); return; } (void) gethead(&spcl); findinode(&spcl); if (gettingfile) { gettingfile = 0; longjmp(restart, 1); } } /* * Handle unexpected EOF. */ static void terminateinput(void) { if (gettingfile && curfile.action == USING) { printf("Warning: %s %s\n", "End-of-input encountered while extracting", curfile.name); } curfile.name = ""; curfile.action = UNKNOWN; curfile.mode = 0; curfile.ino = maxino; if (gettingfile) { gettingfile = 0; longjmp(restart, 1); } } /* * handle multiple dumps per tape by skipping forward to the * appropriate one. */ static void setdumpnum(void) { struct mtop tcom; if (dumpnum == 1 || volno != 1) return; if (pipein) { fprintf(stderr, "Cannot have multiple dumps on pipe input\n"); done(1); } tcom.mt_op = MTFSF; tcom.mt_count = dumpnum - 1; #ifdef RRESTORE if (host) rmtioctl(MTFSF, dumpnum - 1); else #endif if (!pipecmdin && ioctl(mt, MTIOCTOP, (char *)&tcom) < 0) fprintf(stderr, "ioctl MTFSF: %s\n", strerror(errno)); } void printdumpinfo(void) { time_t t; t = _time64_to_time(spcl.c_date); fprintf(stdout, "Dump date: %s", ctime(&t)); t = _time64_to_time(spcl.c_ddate); fprintf(stdout, "Dumped from: %s", (spcl.c_ddate == 0) ? 
"the epoch\n" : ctime(&t)); if (spcl.c_host[0] == '\0') return; fprintf(stderr, "Level %jd dump of %s on %s:%s\n", (intmax_t)spcl.c_level, spcl.c_filesys, spcl.c_host, spcl.c_dev); fprintf(stderr, "Label: %s\n", spcl.c_label); } int extractfile(char *name) { u_int flags; uid_t uid; gid_t gid; mode_t mode; int extsize; struct timespec mtimep[2], ctimep[2]; struct entry *ep; char *buf; curfile.name = name; curfile.action = USING; mtimep[0].tv_sec = curfile.atime_sec; mtimep[0].tv_nsec = curfile.atime_nsec; mtimep[1].tv_sec = curfile.mtime_sec; mtimep[1].tv_nsec = curfile.mtime_nsec; ctimep[0].tv_sec = curfile.atime_sec; ctimep[0].tv_nsec = curfile.atime_nsec; ctimep[1].tv_sec = curfile.birthtime_sec; ctimep[1].tv_nsec = curfile.birthtime_nsec; extsize = curfile.extsize; uid = getuid(); if (uid == 0) uid = curfile.uid; gid = curfile.gid; mode = curfile.mode; flags = curfile.file_flags; switch (mode & IFMT) { default: fprintf(stderr, "%s: unknown file mode 0%o\n", name, mode); skipfile(); return (FAIL); case IFSOCK: vprintf(stdout, "skipped socket %s\n", name); skipfile(); return (GOOD); case IFDIR: if (mflag) { ep = lookupname(name); if (ep == NULL || ep->e_flags & EXTRACT) panic("unextracted directory %s\n", name); skipfile(); return (GOOD); } vprintf(stdout, "extract file %s\n", name); return (genliteraldir(name, curfile.ino)); case IFLNK: lnkbuf[0] = '\0'; pathlen = 0; buf = setupextattr(extsize); getfile(xtrlnkfile, xtrattr, xtrlnkskip); if (pathlen == 0) { vprintf(stdout, "%s: zero length symbolic link (ignored)\n", name); return (GOOD); } if (linkit(lnkbuf, name, SYMLINK) == GOOD) { if (extsize > 0) set_extattr(-1, name, buf, extsize, SXA_LINK); (void) lchown(name, uid, gid); (void) lchmod(name, mode); (void) utimensat(AT_FDCWD, name, ctimep, AT_SYMLINK_NOFOLLOW); (void) utimensat(AT_FDCWD, name, mtimep, AT_SYMLINK_NOFOLLOW); (void) lchflags(name, flags); return (GOOD); } return (FAIL); case IFIFO: vprintf(stdout, "extract fifo %s\n", name); if (Nflag) { skipfile(); return (GOOD); } if (uflag) (void) unlink(name); if (mkfifo(name, 0600) < 0) { fprintf(stderr, "%s: cannot create fifo: %s\n", name, strerror(errno)); skipfile(); return (FAIL); } if (extsize == 0) { skipfile(); } else { buf = setupextattr(extsize); getfile(xtrnull, xtrattr, xtrnull); set_extattr(-1, name, buf, extsize, SXA_FILE); } (void) chown(name, uid, gid); (void) chmod(name, mode); (void) utimensat(AT_FDCWD, name, ctimep, 0); (void) utimensat(AT_FDCWD, name, mtimep, 0); (void) chflags(name, flags); return (GOOD); case IFCHR: case IFBLK: vprintf(stdout, "extract special file %s\n", name); if (Nflag) { skipfile(); return (GOOD); } if (uflag) (void) unlink(name); if (mknod(name, (mode & (IFCHR | IFBLK)) | 0600, (int)curfile.rdev) < 0) { fprintf(stderr, "%s: cannot create special file: %s\n", name, strerror(errno)); skipfile(); return (FAIL); } if (extsize == 0) { skipfile(); } else { buf = setupextattr(extsize); getfile(xtrnull, xtrattr, xtrnull); set_extattr(-1, name, buf, extsize, SXA_FILE); } (void) chown(name, uid, gid); (void) chmod(name, mode); (void) utimensat(AT_FDCWD, name, ctimep, 0); (void) utimensat(AT_FDCWD, name, mtimep, 0); (void) chflags(name, flags); return (GOOD); case IFREG: vprintf(stdout, "extract file %s\n", name); if (Nflag) { skipfile(); return (GOOD); } if (uflag) (void) unlink(name); if ((ofile = open(name, O_WRONLY | O_CREAT | O_TRUNC, 0600)) < 0) { fprintf(stderr, "%s: cannot create file: %s\n", name, strerror(errno)); skipfile(); return (FAIL); } buf = setupextattr(extsize); getfile(xtrfile, 
xtrattr, xtrskip); if (extsize > 0) set_extattr(ofile, name, buf, extsize, SXA_FD); (void) fchown(ofile, uid, gid); (void) fchmod(ofile, mode); (void) futimens(ofile, ctimep); (void) futimens(ofile, mtimep); (void) fchflags(ofile, flags); (void) close(ofile); return (GOOD); } /* NOTREACHED */ } /* * Set attributes on a file descriptor, link, or file. */ void set_extattr(int fd, char *name, void *buf, int size, enum set_extattr_mode mode) { struct extattr *eap, *eaend; const char *method; ssize_t res; int error; char eaname[EXTATTR_MAXNAMELEN + 1]; vprintf(stdout, "Set attributes for %s:", name); eaend = buf + size; for (eap = buf; eap < eaend; eap = EXTATTR_NEXT(eap)) { /* * Make sure this entry is complete. */ if (EXTATTR_NEXT(eap) > eaend || eap->ea_length <= 0) { dprintf(stdout, "\n\t%scorrupted", eap == buf ? "" : "remainder "); break; } if (eap->ea_namespace == EXTATTR_NAMESPACE_EMPTY) continue; snprintf(eaname, sizeof(eaname), "%.*s", (int)eap->ea_namelength, eap->ea_name); vprintf(stdout, "\n\t%s, (%d bytes), %s", namespace_names[eap->ea_namespace], eap->ea_length, eaname); /* * First we try the general attribute setting interface. * However, some attributes can only be set by root or * by using special interfaces (for example, ACLs). */ if (mode == SXA_FD) { res = extattr_set_fd(fd, eap->ea_namespace, eaname, EXTATTR_CONTENT(eap), EXTATTR_CONTENT_SIZE(eap)); method = "extattr_set_fd"; } else if (mode == SXA_LINK) { res = extattr_set_link(name, eap->ea_namespace, eaname, EXTATTR_CONTENT(eap), EXTATTR_CONTENT_SIZE(eap)); method = "extattr_set_link"; } else if (mode == SXA_FILE) { res = extattr_set_file(name, eap->ea_namespace, eaname, EXTATTR_CONTENT(eap), EXTATTR_CONTENT_SIZE(eap)); method = "extattr_set_file"; } if (res != -1) { dprintf(stdout, " (set using %s)", method); continue; } /* * If the general interface refuses to set the attribute, * then we try all the specialized interfaces that we * know about. */ if (eap->ea_namespace == EXTATTR_NAMESPACE_SYSTEM && strcmp(eaname, POSIX1E_ACL_ACCESS_EXTATTR_NAME) == 0) { if (mode == SXA_FD) { error = acl_set_fd(fd, EXTATTR_CONTENT(eap)); method = "acl_set_fd"; } else if (mode == SXA_LINK) { error = acl_set_link_np(name, ACL_TYPE_ACCESS, EXTATTR_CONTENT(eap)); method = "acl_set_link_np"; } else if (mode == SXA_FILE) { error = acl_set_file(name, ACL_TYPE_ACCESS, EXTATTR_CONTENT(eap)); method = "acl_set_file"; } if (error != -1) { dprintf(stdout, " (set using %s)", method); continue; } } if (eap->ea_namespace == EXTATTR_NAMESPACE_SYSTEM && strcmp(eaname, POSIX1E_ACL_DEFAULT_EXTATTR_NAME) == 0) { if (mode == SXA_LINK) { error = acl_set_link_np(name, ACL_TYPE_DEFAULT, EXTATTR_CONTENT(eap)); method = "acl_set_link_np"; } else { error = acl_set_file(name, ACL_TYPE_DEFAULT, EXTATTR_CONTENT(eap)); method = "acl_set_file"; } if (error != -1) { dprintf(stdout, " (set using %s)", method); continue; } } vprintf(stdout, " (unable to set)"); } vprintf(stdout, "\n"); } /* * skip over bit maps on the tape */ void skipmaps(void) { while (spcl.c_type == TS_BITS || spcl.c_type == TS_CLRI) skipfile(); } /* * skip over a file on the tape */ void skipfile(void) { curfile.action = SKIP; getfile(xtrnull, xtrnull, xtrnull); } /* * Skip a hole in an output file */ static void skiphole(void (*skip)(char *, size_t), size_t *seekpos) { char buf[MAXBSIZE]; if (*seekpos > 0) { (*skip)(buf, *seekpos); *seekpos = 0; } } /* * Extract a file from the tape. 
* When an allocated block is found it is passed to the fill function; * when an unallocated block (hole) is found, a zeroed buffer is passed * to the skip function. */ void getfile(void (*datafill)(char *, size_t), void (*attrfill)(char *, size_t), void (*skip)(char *, size_t)) { int i; volatile off_t size; size_t seekpos; int curblk, attrsize; void (*fillit)(char *, size_t); char buf[MAXBSIZE / TP_BSIZE][TP_BSIZE]; char junk[TP_BSIZE]; curblk = 0; size = spcl.c_size; seekpos = 0; attrsize = spcl.c_extsize; if (spcl.c_type == TS_END) panic("ran off end of tape\n"); if (spcl.c_magic != FS_UFS2_MAGIC) panic("not at beginning of a file\n"); if (!gettingfile && setjmp(restart) != 0) return; gettingfile++; fillit = datafill; if (size == 0 && attrsize > 0) { fillit = attrfill; size = attrsize; attrsize = 0; } loop: for (i = 0; i < spcl.c_count; i++) { if (!readmapflag && i > TP_NINDIR) { if (Dflag) { fprintf(stderr, "spcl.c_count = %jd\n", (intmax_t)spcl.c_count); break; } else panic("spcl.c_count = %jd\n", (intmax_t)spcl.c_count); } if (readmapflag || spcl.c_addr[i]) { readtape(&buf[curblk++][0]); if (curblk == fssize / TP_BSIZE) { skiphole(skip, &seekpos); (*fillit)((char *)buf, (long)(size > TP_BSIZE ? fssize : (curblk - 1) * TP_BSIZE + size)); curblk = 0; } } else { if (curblk > 0) { skiphole(skip, &seekpos); (*fillit)((char *)buf, (long)(size > TP_BSIZE ? curblk * TP_BSIZE : (curblk - 1) * TP_BSIZE + size)); curblk = 0; } /* * We have a block of a hole. Don't skip it * now, because there may be next adjacent * block of the hole in the file. Postpone the * seek until next file write. */ seekpos += (long)MIN(TP_BSIZE, size); } if ((size -= TP_BSIZE) <= 0) { if (size > -TP_BSIZE && curblk > 0) { skiphole(skip, &seekpos); (*fillit)((char *)buf, (long)((curblk * TP_BSIZE) + size)); curblk = 0; } if (attrsize > 0) { fillit = attrfill; size = attrsize; attrsize = 0; continue; } if (spcl.c_count - i > 1) dprintf(stdout, "skipping %d junk block(s)\n", spcl.c_count - i - 1); for (i++; i < spcl.c_count; i++) { if (!readmapflag && i > TP_NINDIR) { if (Dflag) { fprintf(stderr, "spcl.c_count = %jd\n", (intmax_t)spcl.c_count); break; } else panic("spcl.c_count = %jd\n", (intmax_t)spcl.c_count); } if (readmapflag || spcl.c_addr[i]) readtape(junk); } break; } } if (gethead(&spcl) == GOOD && size > 0) { if (spcl.c_type == TS_ADDR) goto loop; dprintf(stdout, "Missing address (header) block for %s at %ld blocks\n", curfile.name, blksread); } if (curblk > 0) panic("getfile: lost data\n"); findinode(&spcl); gettingfile = 0; } /* * These variables are shared between the next two functions. */ static int extbufsize = 0; static char *extbuf; static int extloc; /* * Allocate a buffer into which to extract extended attributes. */ static char * setupextattr(int extsize) { extloc = 0; if (extsize <= extbufsize) return (extbuf); if (extbufsize > 0) free(extbuf); if ((extbuf = malloc(extsize)) != NULL) { extbufsize = extsize; return (extbuf); } extbufsize = 0; extbuf = NULL; fprintf(stderr, "Cannot extract %d bytes %s for inode %ju, name %s\n", extsize, "of extended attributes", (uintmax_t)curfile.ino, curfile.name); return (NULL); } /* * Extract the next block of extended attributes. */ static void xtrattr(char *buf, size_t size) { if (extloc + size > extbufsize) panic("overrun attribute buffer\n"); memmove(&extbuf[extloc], buf, size); extloc += size; } /* * Write out the next block of a file. 
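 * (Allocated blocks are handed here by getfile(); holes arrive through
 * the skip callback, which for a regular file is xtrskip() below.
 * That routine merely lseek()s the output descriptor forward, so runs
 * of unallocated blocks become sparse regions in the restored file
 * rather than written zeroes.)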
*/ static void xtrfile(char *buf, size_t size) { if (Nflag) return; if (write(ofile, buf, (int) size) == -1) { fprintf(stderr, "write error extracting inode %ju, name %s\nwrite: %s\n", (uintmax_t)curfile.ino, curfile.name, strerror(errno)); } } /* * Skip over a hole in a file. */ /* ARGSUSED */ static void xtrskip(char *buf, size_t size) { if (lseek(ofile, size, SEEK_CUR) == -1) { fprintf(stderr, "seek error extracting inode %ju, name %s\nlseek: %s\n", (uintmax_t)curfile.ino, curfile.name, strerror(errno)); done(1); } } /* * Collect the next block of a symbolic link. */ static void xtrlnkfile(char *buf, size_t size) { pathlen += size; if (pathlen > MAXPATHLEN) { fprintf(stderr, "symbolic link name: %s->%s%s; too long %d\n", curfile.name, lnkbuf, buf, pathlen); done(1); } (void) strcat(lnkbuf, buf); } /* * Skip over a hole in a symbolic link (should never happen). */ /* ARGSUSED */ static void xtrlnkskip(char *buf, size_t size) { fprintf(stderr, "unallocated block in symbolic link %s\n", curfile.name); done(1); } /* * Collect the next block of a bit map. */ static void xtrmap(char *buf, size_t size) { memmove(map, buf, size); map += size; } /* * Skip over a hole in a bit map (should never happen). */ /* ARGSUSED */ static void xtrmapskip(char *buf, size_t size) { panic("hole in map\n"); map += size; } /* * Noop, when an extraction function is not needed. */ /* ARGSUSED */ void xtrnull(char *buf, size_t size) { return; } /* * Read TP_BSIZE blocks from the input. * Handle read errors, and end of media. */ static void readtape(char *buf) { long rd, newvol, i, oldnumtrec; int cnt, seek_failed; if (blkcnt + (byteslide > 0) < numtrec) { memmove(buf, &tapebuf[(blkcnt++ * TP_BSIZE) + byteslide], (long)TP_BSIZE); blksread++; tapeaddr++; return; } if (numtrec > 0) memmove(&tapebuf[-TP_BSIZE], &tapebuf[(numtrec-1) * TP_BSIZE], (long)TP_BSIZE); oldnumtrec = numtrec; for (i = 0; i < ntrec; i++) ((struct s_spcl *)&tapebuf[i * TP_BSIZE])->c_magic = 0; if (numtrec == 0) numtrec = ntrec; cnt = ntrec * TP_BSIZE; rd = 0; getmore: #ifdef RRESTORE if (host) i = rmtread(&tapebuf[rd], cnt); else #endif i = read(mt, &tapebuf[rd], cnt); /* * Check for mid-tape short read error. * If found, skip rest of buffer and start with the next. */ if (!pipein && !pipecmdin && numtrec < ntrec && i > 0) { dprintf(stdout, "mid-media short read error.\n"); numtrec = ntrec; } /* * Handle partial block read. */ if ((pipein || pipecmdin) && i == 0 && rd > 0) i = rd; else if (i > 0 && i != ntrec * TP_BSIZE) { if (pipein || pipecmdin) { rd += i; cnt -= i; if (cnt > 0) goto getmore; i = rd; } else { /* * Short read. Process the blocks read. */ if (i % TP_BSIZE != 0) vprintf(stdout, "partial block read: %ld should be %ld\n", i, ntrec * TP_BSIZE); numtrec = i / TP_BSIZE; } } /* * Handle read error. */ if (i < 0) { fprintf(stderr, "Tape read error while "); switch (curfile.action) { default: fprintf(stderr, "trying to set up tape\n"); break; case UNKNOWN: fprintf(stderr, "trying to resynchronize\n"); break; case USING: fprintf(stderr, "restoring %s\n", curfile.name); break; case SKIP: fprintf(stderr, "skipping over inode %ju\n", (uintmax_t)curfile.ino); break; } if (!yflag && !reply("continue")) done(1); i = ntrec * TP_BSIZE; memset(tapebuf, 0, i); #ifdef RRESTORE if (host) seek_failed = (rmtseek(i, 1) < 0); else #endif seek_failed = (lseek(mt, i, SEEK_CUR) == (off_t)-1); if (seek_failed) { fprintf(stderr, "continuation failed: %s\n", strerror(errno)); done(1); } } /* * Handle end of tape. 
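 * (A read of zero bytes on a real tape means the volume ran out, so
 * the next volume is requested via getvol() and the read is retried.
 * A pipe gets no second volume: the synthetic endoftapemark record
 * prepared in setup(), a TS_END header with its checksum precomputed,
 * is appended to the buffer instead so that gethead() sees a clean
 * end-of-dump.)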
*/ if (i == 0) { vprintf(stdout, "End-of-tape encountered\n"); if (!pipein) { newvol = volno + 1; volno = 0; numtrec = 0; getvol(newvol); readtape(buf); return; } if (rd % TP_BSIZE != 0) panic("partial block read: %ld should be %ld\n", rd, ntrec * TP_BSIZE); terminateinput(); memmove(&tapebuf[rd], &endoftapemark, (long)TP_BSIZE); } if (oldnumtrec == 0) blkcnt = 0; else blkcnt -= oldnumtrec; memmove(buf, &tapebuf[(blkcnt++ * TP_BSIZE) + byteslide], (long)TP_BSIZE); blksread++; tapeaddr++; } static void findtapeblksize(void) { long i; for (i = 0; i < ntrec; i++) ((struct s_spcl *)&tapebuf[i * TP_BSIZE])->c_magic = 0; blkcnt = 0; #ifdef RRESTORE if (host) i = rmtread(tapebuf, ntrec * TP_BSIZE); else #endif i = read(mt, tapebuf, ntrec * TP_BSIZE); if (i <= 0) { fprintf(stderr, "tape read error: %s\n", strerror(errno)); done(1); } if (i % TP_BSIZE != 0) { fprintf(stderr, "Tape block size (%ld) %s (%d)\n", i, "is not a multiple of dump block size", TP_BSIZE); done(1); } ntrec = i / TP_BSIZE; numtrec = ntrec; vprintf(stdout, "Tape block size is %ld\n", ntrec); } void closemt(void) { if (mt < 0) return; if (pipecmdin) { pclose(popenfp); popenfp = NULL; } else #ifdef RRESTORE if (host) rmtclose(); else #endif (void) close(mt); } /* * Read the next block from the tape. * If it is not any valid header, return an error. */ static int gethead(struct s_spcl *buf) { long i; readtape((char *)buf); if (buf->c_magic != FS_UFS2_MAGIC && buf->c_magic != NFS_MAGIC) { if (buf->c_magic == OFS_MAGIC) { fprintf(stderr, "Format of dump tape is too old. Must use\n"); fprintf(stderr, "a version of restore from before 2002.\n"); return (FAIL); } if (swabl(buf->c_magic) != FS_UFS2_MAGIC && buf->c_magic != NFS_MAGIC) { if (buf->c_magic == OFS_MAGIC) { fprintf(stderr, "Format of dump tape is too old. Must use\n"); fprintf(stderr, "a version of restore from before 2002.\n"); } return (FAIL); } if (!Bcvt) { vprintf(stdout, "Note: Doing Byte swapping\n"); Bcvt = 1; } } if (checksum((int *)buf) == FAIL) return (FAIL); if (Bcvt) { swabst((u_char *)"8l4s1q8l2q17l", (u_char *)buf); swabst((u_char *)"l",(u_char *) &buf->c_level); swabst((u_char *)"2l4q",(u_char *) &buf->c_flags); } readmapflag = 0; switch (buf->c_type) { case TS_CLRI: case TS_BITS: /* * Have to patch up missing information in bit map headers */ buf->c_size = buf->c_count * TP_BSIZE; if (buf->c_count > TP_NINDIR) readmapflag = 1; else for (i = 0; i < buf->c_count; i++) buf->c_addr[i]++; /* FALL THROUGH */ case TS_TAPE: if (buf->c_magic == NFS_MAGIC && (buf->c_flags & NFS_DR_NEWINODEFMT) == 0) oldinofmt = 1; /* FALL THROUGH */ case TS_END: buf->c_inumber = 0; /* FALL THROUGH */ case TS_ADDR: case TS_INODE: /* * For old dump tapes, have to copy up old fields to * new locations. 
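 * (NFS_MAGIC marks the old header layout, which carried these fields
 * in their original positions with 32-bit timestamps; the times are
 * widened through _time32_to_time(), and fields the old format never
 * had, such as the birth time and extended-attribute size, are zeroed.)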
*/ if (buf->c_magic == NFS_MAGIC) { buf->c_tapea = buf->c_old_tapea; buf->c_firstrec = buf->c_old_firstrec; buf->c_date = _time32_to_time(buf->c_old_date); buf->c_ddate = _time32_to_time(buf->c_old_ddate); buf->c_atime = _time32_to_time(buf->c_old_atime); buf->c_mtime = _time32_to_time(buf->c_old_mtime); buf->c_birthtime = 0; buf->c_birthtimensec = 0; buf->c_extsize = 0; } break; default: panic("gethead: unknown inode type %d\n", buf->c_type); break; } if (dumpdate != 0 && _time64_to_time(buf->c_date) != dumpdate) fprintf(stderr, "Header with wrong dumpdate.\n"); /* * If we're restoring a filesystem with the old (FreeBSD 1) * format inodes, copy the uid/gid to the new location */ if (oldinofmt) { buf->c_uid = buf->c_spare1[1]; buf->c_gid = buf->c_spare1[2]; } buf->c_magic = FS_UFS2_MAGIC; tapeaddr = buf->c_tapea; if (dflag) accthdr(buf); return(GOOD); } /* * Check that a header is where it belongs and predict the next header */ static void accthdr(struct s_spcl *header) { static ino_t previno = 0x7fffffff; static int prevtype; static long predict; long blks, i; if (header->c_type == TS_TAPE) { fprintf(stderr, "Volume header "); if (header->c_firstrec) fprintf(stderr, "begins with record %jd", (intmax_t)header->c_firstrec); fprintf(stderr, "\n"); previno = 0x7fffffff; return; } if (previno == 0x7fffffff) goto newcalc; switch (prevtype) { case TS_BITS: fprintf(stderr, "Dumped inodes map header"); break; case TS_CLRI: fprintf(stderr, "Used inodes map header"); break; case TS_INODE: fprintf(stderr, "File header, ino %ju", (uintmax_t)previno); break; case TS_ADDR: fprintf(stderr, "File continuation header, ino %ju", (uintmax_t)previno); break; case TS_END: fprintf(stderr, "End of tape header"); break; } if (predict != blksread - 1) fprintf(stderr, "; predicted %ld blocks, got %ld blocks", predict, blksread - 1); fprintf(stderr, "\n"); newcalc: blks = 0; if (header->c_type != TS_END) for (i = 0; i < header->c_count; i++) if (readmapflag || header->c_addr[i] != 0) blks++; predict = blks; blksread = 0; prevtype = header->c_type; previno = header->c_inumber; } /* * Find an inode header. * Complain if had to skip. */ static void findinode(struct s_spcl *header) { static long skipcnt = 0; long i; char buf[TP_BSIZE]; int htype; curfile.name = ""; curfile.action = UNKNOWN; curfile.mode = 0; curfile.ino = 0; do { htype = header->c_type; switch (htype) { case TS_ADDR: /* * Skip up to the beginning of the next record */ for (i = 0; i < header->c_count; i++) if (header->c_addr[i]) readtape(buf); while (gethead(header) == FAIL || _time64_to_time(header->c_date) != dumpdate) { skipcnt++; if (Dflag) { byteslide++; if (byteslide < TP_BSIZE) { blkcnt--; blksread--; } else byteslide = 0; } } break; case TS_INODE: curfile.mode = header->c_mode; curfile.uid = header->c_uid; curfile.gid = header->c_gid; curfile.file_flags = header->c_file_flags; curfile.rdev = header->c_rdev; curfile.atime_sec = header->c_atime; curfile.atime_nsec = header->c_atimensec; curfile.mtime_sec = header->c_mtime; curfile.mtime_nsec = header->c_mtimensec; curfile.birthtime_sec = header->c_birthtime; curfile.birthtime_nsec = header->c_birthtimensec; curfile.extsize = header->c_extsize; curfile.size = header->c_size; curfile.ino = header->c_inumber; break; case TS_END: /* If we missed some tapes, get another volume. 
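 * Bit v-1 of tapesread records that volume v has been read, so
 * "tapesread & (tapesread + 1)" is nonzero exactly when the mask is
 * not of the form 2^k - 1, i.e. when an earlier volume was skipped.
 * For example, volumes 1 and 3 read but not 2 gives tapesread = 101
 * (binary), and 101 & 110 = 100 is nonzero, so another volume is
 * requested.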
*/ if (tapesread & (tapesread + 1)) { getvol(0); continue; } curfile.ino = maxino; break; case TS_CLRI: curfile.name = ""; break; case TS_BITS: curfile.name = ""; break; case TS_TAPE: if (Dflag) fprintf(stderr, "unexpected tape header\n"); else panic("unexpected tape header\n"); default: if (Dflag) fprintf(stderr, "unknown tape header type %d\n", spcl.c_type); else panic("unknown tape header type %d\n", spcl.c_type); while (gethead(header) == FAIL || _time64_to_time(header->c_date) != dumpdate) { skipcnt++; if (Dflag) { byteslide++; if (byteslide < TP_BSIZE) { blkcnt--; blksread--; } else byteslide = 0; } } } } while (htype == TS_ADDR); if (skipcnt > 0) fprintf(stderr, "resync restore, skipped %ld %s\n", skipcnt, Dflag ? "bytes" : "blocks"); skipcnt = 0; } static int checksum(int *buf) { int i, j; j = sizeof(union u_spcl) / sizeof(int); i = 0; if (!Bcvt) { do i += *buf++; while (--j); } else { /* What happens if we want to read restore tapes for a 16bit int machine??? */ do i += swabl(*buf++); while (--j); } if (i != CHECKSUM) { fprintf(stderr, "Checksum error %o, inode %ju file %s\n", i, (uintmax_t)curfile.ino, curfile.name); return(FAIL); } return(GOOD); } #ifdef RRESTORE #include void msg(const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void)vfprintf(stderr, fmt, ap); va_end(ap); } #endif /* RRESTORE */ static u_char * swabshort(u_char *sp, int n) { char c; while (--n >= 0) { c = sp[0]; sp[0] = sp[1]; sp[1] = c; sp += 2; } return (sp); } static u_char * swablong(u_char *sp, int n) { char c; while (--n >= 0) { c = sp[0]; sp[0] = sp[3]; sp[3] = c; c = sp[2]; sp[2] = sp[1]; sp[1] = c; sp += 4; } return (sp); } static u_char * swabquad(u_char *sp, int n) { char c; while (--n >= 0) { c = sp[0]; sp[0] = sp[7]; sp[7] = c; c = sp[1]; sp[1] = sp[6]; sp[6] = c; c = sp[2]; sp[2] = sp[5]; sp[5] = c; c = sp[3]; sp[3] = sp[4]; sp[4] = c; sp += 8; } return (sp); } void swabst(u_char *cp, u_char *sp) { int n = 0; while (*cp) { switch (*cp) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = (n * 10) + (*cp++ - '0'); continue; case 's': case 'w': case 'h': if (n == 0) n = 1; sp = swabshort(sp, n); break; case 'l': if (n == 0) n = 1; sp = swablong(sp, n); break; case 'q': if (n == 0) n = 1; sp = swabquad(sp, n); break; case 'b': if (n == 0) n = 1; sp += n; break; default: fprintf(stderr, "Unknown conversion character: %c\n", *cp); done(0); break; } cp++; n = 0; } } static u_long swabl(u_long x) { swabst((u_char *)"l", (u_char *)&x); return (x); } Index: head/sbin/restore/utilities.c =================================================================== --- head/sbin/restore/utilities.c (revision 313779) +++ head/sbin/restore/utilities.c (revision 313780) @@ -1,422 +1,422 @@ /* * Copyright (c) 1983, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)utilities.c 8.5 (Berkeley) 4/28/95"; #endif static const char rcsid[] = "$FreeBSD$"; #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include "restore.h" #include "extern.h" /* * Insure that all the components of a pathname exist. */ void pathcheck(char *name) { char *cp; struct entry *ep; char *start; start = strchr(name, '/'); if (start == NULL) return; for (cp = start; *cp != '\0'; cp++) { if (*cp != '/') continue; *cp = '\0'; ep = lookupname(name); if (ep == NULL) { /* Safe; we know the pathname exists in the dump. */ ep = addentry(name, pathsearch(name)->d_ino, NODE); newnode(ep); } ep->e_flags |= NEW|KEEP; *cp = '/'; } } /* * Change a name to a unique temporary name. */ void mktempname(struct entry *ep) { char oldname[MAXPATHLEN]; if (ep->e_flags & TMPNAME) badentry(ep, "mktempname: called with TMPNAME"); ep->e_flags |= TMPNAME; (void) strcpy(oldname, myname(ep)); freename(ep->e_name); ep->e_name = savename(gentempname(ep)); ep->e_namlen = strlen(ep->e_name); renameit(oldname, myname(ep)); } /* * Generate a temporary name for an entry. */ char * gentempname(struct entry *ep) { static char name[MAXPATHLEN]; struct entry *np; long i = 0; for (np = lookupino(ep->e_ino); np != NULL && np != ep; np = np->e_links) i++; if (np == NULL) badentry(ep, "not on ino list"); (void) sprintf(name, "%s%ld%lu", TMPHDR, i, (u_long)ep->e_ino); return (name); } /* * Rename a file or directory. */ void renameit(char *from, char *to) { if (!Nflag && rename(from, to) < 0) { fprintf(stderr, "warning: cannot rename %s to %s: %s\n", from, to, strerror(errno)); return; } vprintf(stdout, "rename %s to %s\n", from, to); } /* * Create a new node (directory). */ void newnode(struct entry *np) { char *cp; if (np->e_type != NODE) badentry(np, "newnode: not a node"); cp = myname(np); if (!Nflag && mkdir(cp, 0777) < 0 && !uflag) { np->e_flags |= EXISTED; fprintf(stderr, "warning: %s: %s\n", cp, strerror(errno)); return; } vprintf(stdout, "Make node %s\n", cp); } /* * Remove an old node (directory). */ void removenode(struct entry *ep) { char *cp; if (ep->e_type != NODE) badentry(ep, "removenode: not a node"); if (ep->e_entries != NULL) badentry(ep, "removenode: non-empty directory"); ep->e_flags |= REMOVED; ep->e_flags &= ~TMPNAME; cp = myname(ep); if (!Nflag && rmdir(cp) < 0) { fprintf(stderr, "warning: %s: %s\n", cp, strerror(errno)); return; } vprintf(stdout, "Remove node %s\n", cp); } /* * Remove a leaf. 
*/ void removeleaf(struct entry *ep) { char *cp; if (ep->e_type != LEAF) badentry(ep, "removeleaf: not a leaf"); ep->e_flags |= REMOVED; ep->e_flags &= ~TMPNAME; cp = myname(ep); if (!Nflag && unlink(cp) < 0) { fprintf(stderr, "warning: %s: %s\n", cp, strerror(errno)); return; } vprintf(stdout, "Remove leaf %s\n", cp); } /* * Create a link. */ int linkit(char *existing, char *new, int type) { /* if we want to unlink first, do it now so *link() won't fail */ if (uflag && !Nflag) (void)unlink(new); if (type == SYMLINK) { if (!Nflag && symlink(existing, new) < 0) { fprintf(stderr, "warning: cannot create symbolic link %s->%s: %s\n", new, existing, strerror(errno)); return (FAIL); } } else if (type == HARDLINK) { int ret; if (!Nflag && (ret = link(existing, new)) < 0) { struct stat s; /* * Most likely, the schg flag is set. Clear the * flags and try again. */ if (stat(existing, &s) == 0 && s.st_flags != 0 && chflags(existing, 0) == 0) { ret = link(existing, new); chflags(existing, s.st_flags); } if (ret < 0) { fprintf(stderr, "warning: cannot create " "hard link %s->%s: %s\n", new, existing, strerror(errno)); return (FAIL); } } } else { panic("linkit: unknown type %d\n", type); return (FAIL); } vprintf(stdout, "Create %s link %s->%s\n", type == SYMLINK ? "symbolic" : "hard", new, existing); return (GOOD); } /* * Create a whiteout. */ int addwhiteout(char *name) { if (!Nflag && mknod(name, S_IFWHT, 0) < 0) { fprintf(stderr, "warning: cannot create whiteout %s: %s\n", name, strerror(errno)); return (FAIL); } vprintf(stdout, "Create whiteout %s\n", name); return (GOOD); } /* * Delete a whiteout. */ void delwhiteout(struct entry *ep) { char *name; if (ep->e_type != LEAF) badentry(ep, "delwhiteout: not a leaf"); ep->e_flags |= REMOVED; ep->e_flags &= ~TMPNAME; name = myname(ep); if (!Nflag && undelete(name) < 0) { fprintf(stderr, "warning: cannot delete whiteout %s: %s\n", name, strerror(errno)); return; } vprintf(stdout, "Delete whiteout %s\n", name); } /* * find lowest number file (above "start") that needs to be extracted */ ino_t lowerbnd(ino_t start) { struct entry *ep; for ( ; start < maxino; start++) { ep = lookupino(start); if (ep == NULL || ep->e_type == NODE) continue; if (ep->e_flags & (NEW|EXTRACT)) return (start); } return (start); } /* * find highest number file (below "start") that needs to be extracted */ ino_t upperbnd(ino_t start) { struct entry *ep; - for ( ; start > ROOTINO; start--) { + for ( ; start > UFS_ROOTINO; start--) { ep = lookupino(start); if (ep == NULL || ep->e_type == NODE) continue; if (ep->e_flags & (NEW|EXTRACT)) return (start); } return (start); } /* * report on a badly formed entry */ void badentry(struct entry *ep, char *msg) { fprintf(stderr, "bad entry: %s\n", msg); fprintf(stderr, "name: %s\n", myname(ep)); fprintf(stderr, "parent name %s\n", myname(ep->e_parent)); if (ep->e_sibling != NULL) fprintf(stderr, "sibling name: %s\n", myname(ep->e_sibling)); if (ep->e_entries != NULL) fprintf(stderr, "next entry name: %s\n", myname(ep->e_entries)); if (ep->e_links != NULL) fprintf(stderr, "next link name: %s\n", myname(ep->e_links)); if (ep->e_next != NULL) fprintf(stderr, "next hashchain name: %s\n", myname(ep->e_next)); fprintf(stderr, "entry type: %s\n", ep->e_type == NODE ? "NODE" : "LEAF"); fprintf(stderr, "inode number: %lu\n", (u_long)ep->e_ino); panic("flags: %s\n", flagvalues(ep)); } /* * Construct a string indicating the active flag bits of an entry. 
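 * (The buffer is seeded with "|NIL" and the leading '|' is then
 * overwritten with a NUL: if no flag bits are set the caller sees the
 * surviving "NIL", while the first strcat() below overwrites it
 * starting at that NUL, and returning &flagbuf[1] skips the leading
 * '|'.)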
*/ char * flagvalues(struct entry *ep) { static char flagbuf[BUFSIZ]; (void) strcpy(flagbuf, "|NIL"); flagbuf[0] = '\0'; if (ep->e_flags & REMOVED) (void) strcat(flagbuf, "|REMOVED"); if (ep->e_flags & TMPNAME) (void) strcat(flagbuf, "|TMPNAME"); if (ep->e_flags & EXTRACT) (void) strcat(flagbuf, "|EXTRACT"); if (ep->e_flags & NEW) (void) strcat(flagbuf, "|NEW"); if (ep->e_flags & KEEP) (void) strcat(flagbuf, "|KEEP"); if (ep->e_flags & EXISTED) (void) strcat(flagbuf, "|EXISTED"); return (&flagbuf[1]); } /* * Check to see if a name is on a dump tape. */ ino_t dirlookup(const char *name) { struct direct *dp; ino_t ino; ino = ((dp = pathsearch(name)) == NULL) ? 0 : dp->d_ino; if (ino == 0 || TSTINO(ino, dumpmap) == 0) fprintf(stderr, "%s is not on the tape\n", name); return (ino); } /* * Elicit a reply. */ int reply(char *question) { int c; do { fprintf(stderr, "%s? [yn] ", question); (void) fflush(stderr); c = getc(terminal); while (c != '\n' && getc(terminal) != '\n') if (c == EOF) return (FAIL); } while (c != 'y' && c != 'n'); if (c == 'y') return (GOOD); return (FAIL); } /* * handle unexpected inconsistencies */ #include void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); if (yflag) return; if (reply("abort") == GOOD) { if (reply("dump core") == GOOD) abort(); done(1); } } Index: head/sbin/tunefs/tunefs.c =================================================================== --- head/sbin/tunefs/tunefs.c (revision 313779) +++ head/sbin/tunefs/tunefs.c (revision 313780) @@ -1,1144 +1,1144 @@ /* * Copyright (c) 1983, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1983, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint static char sccsid[] = "@(#)tunefs.c 8.2 (Berkeley) 4/19/94"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); /* * tunefs: change layout parameters to an existing file system. 
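 * Illustrative invocations only (the device path is hypothetical):
 *
 *	tunefs -p /dev/ada0s1a			# print current settings
 *	tunefs -m 8 -o time /dev/ada0s1a	# minfree 8%, optimize for time
 *
 * The full option set is parsed in main() below.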
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* the optimization warning string template */ #define OPTWARN "should optimize for %s with minfree %s %d%%" static int blocks; static char clrbuf[MAXBSIZE]; static struct uufsd disk; #define sblock disk.d_fs static void usage(void); static void printfs(void); static int journal_alloc(int64_t size); static void journal_clear(void); static void sbdirty(void); int main(int argc, char *argv[]) { const char *avalue, *jvalue, *Jvalue, *Lvalue, *lvalue, *Nvalue, *nvalue; const char *tvalue; const char *special, *on; const char *name; int active; int Aflag, aflag, eflag, evalue, fflag, fvalue, jflag, Jflag, kflag; int kvalue, Lflag, lflag, mflag, mvalue, Nflag, nflag, oflag, ovalue; int pflag, sflag, svalue, Svalue, tflag; int ch, found_arg, i; int iovlen = 0; const char *chg[2]; struct statfs stfs; struct iovec *iov = NULL; char errmsg[255] = {0}; if (argc < 3) usage(); Aflag = aflag = eflag = fflag = jflag = Jflag = kflag = Lflag = 0; lflag = mflag = Nflag = nflag = oflag = pflag = sflag = tflag = 0; avalue = jvalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL; evalue = fvalue = mvalue = ovalue = svalue = Svalue = 0; active = 0; found_arg = 0; /* At least one arg is required. */ while ((ch = getopt(argc, argv, "Aa:e:f:j:J:k:L:l:m:N:n:o:ps:S:t:")) != -1) switch (ch) { case 'A': found_arg = 1; Aflag++; break; case 'a': found_arg = 1; name = "POSIX.1e ACLs"; avalue = optarg; if (strcmp(avalue, "enable") && strcmp(avalue, "disable")) { errx(10, "bad %s (options are %s)", name, "`enable' or `disable'"); } aflag = 1; break; case 'e': found_arg = 1; name = "maximum blocks per file in a cylinder group"; evalue = atoi(optarg); if (evalue < 1) errx(10, "%s must be >= 1 (was %s)", name, optarg); eflag = 1; break; case 'f': found_arg = 1; name = "average file size"; fvalue = atoi(optarg); if (fvalue < 1) errx(10, "%s must be >= 1 (was %s)", name, optarg); fflag = 1; break; case 'j': found_arg = 1; name = "softdep journaled file system"; jvalue = optarg; if (strcmp(jvalue, "enable") && strcmp(jvalue, "disable")) { errx(10, "bad %s (options are %s)", name, "`enable' or `disable'"); } jflag = 1; break; case 'J': found_arg = 1; name = "gjournaled file system"; Jvalue = optarg; if (strcmp(Jvalue, "enable") && strcmp(Jvalue, "disable")) { errx(10, "bad %s (options are %s)", name, "`enable' or `disable'"); } Jflag = 1; break; case 'k': found_arg = 1; name = "space to hold for metadata blocks"; kvalue = atoi(optarg); if (kvalue < 0) errx(10, "bad %s (%s)", name, optarg); kflag = 1; break; case 'L': found_arg = 1; name = "volume label"; Lvalue = optarg; i = -1; while (isalnum(Lvalue[++i])); if (Lvalue[i] != '\0') { errx(10, "bad %s. Valid characters are alphanumerics.", name); } if (strlen(Lvalue) >= MAXVOLLEN) { errx(10, "bad %s. 
Length is longer than %d.", name, MAXVOLLEN - 1); } Lflag = 1; break; case 'l': found_arg = 1; name = "multilabel MAC file system"; lvalue = optarg; if (strcmp(lvalue, "enable") && strcmp(lvalue, "disable")) { errx(10, "bad %s (options are %s)", name, "`enable' or `disable'"); } lflag = 1; break; case 'm': found_arg = 1; name = "minimum percentage of free space"; mvalue = atoi(optarg); if (mvalue < 0 || mvalue > 99) errx(10, "bad %s (%s)", name, optarg); mflag = 1; break; case 'N': found_arg = 1; name = "NFSv4 ACLs"; Nvalue = optarg; if (strcmp(Nvalue, "enable") && strcmp(Nvalue, "disable")) { errx(10, "bad %s (options are %s)", name, "`enable' or `disable'"); } Nflag = 1; break; case 'n': found_arg = 1; name = "soft updates"; nvalue = optarg; if (strcmp(nvalue, "enable") != 0 && strcmp(nvalue, "disable") != 0) { errx(10, "bad %s (options are %s)", name, "`enable' or `disable'"); } nflag = 1; break; case 'o': found_arg = 1; name = "optimization preference"; if (strcmp(optarg, "space") == 0) ovalue = FS_OPTSPACE; else if (strcmp(optarg, "time") == 0) ovalue = FS_OPTTIME; else errx(10, "bad %s (options are `space' or `time')", name); oflag = 1; break; case 'p': found_arg = 1; pflag = 1; break; case 's': found_arg = 1; name = "expected number of files per directory"; svalue = atoi(optarg); if (svalue < 1) errx(10, "%s must be >= 1 (was %s)", name, optarg); sflag = 1; break; case 'S': found_arg = 1; name = "Softdep Journal Size"; Svalue = atoi(optarg); if (Svalue < SUJ_MIN) errx(10, "%s must be >= %d (was %s)", name, SUJ_MIN, optarg); break; case 't': found_arg = 1; name = "trim"; tvalue = optarg; if (strcmp(tvalue, "enable") != 0 && strcmp(tvalue, "disable") != 0) { errx(10, "bad %s (options are %s)", name, "`enable' or `disable'"); } tflag = 1; break; default: usage(); } argc -= optind; argv += optind; if (found_arg == 0 || argc != 1) usage(); on = special = argv[0]; if (ufs_disk_fillout(&disk, special) == -1) goto err; if (disk.d_name != special) { if (statfs(special, &stfs) != 0) warn("Can't stat %s", special); if (strcmp(special, stfs.f_mntonname) == 0) active = 1; } if (pflag) { printfs(); exit(0); } if (Lflag) { name = "volume label"; strncpy(sblock.fs_volname, Lvalue, MAXVOLLEN); } if (aflag) { name = "POSIX.1e ACLs"; if (strcmp(avalue, "enable") == 0) { if (sblock.fs_flags & FS_ACLS) { warnx("%s remains unchanged as enabled", name); } else if (sblock.fs_flags & FS_NFS4ACLS) { warnx("%s and NFSv4 ACLs are mutually " "exclusive", name); } else { sblock.fs_flags |= FS_ACLS; warnx("%s set", name); } } else if (strcmp(avalue, "disable") == 0) { if ((~sblock.fs_flags & FS_ACLS) == FS_ACLS) { warnx("%s remains unchanged as disabled", name); } else { sblock.fs_flags &= ~FS_ACLS; warnx("%s cleared", name); } } } if (eflag) { name = "maximum blocks per file in a cylinder group"; if (sblock.fs_maxbpg == evalue) warnx("%s remains unchanged as %d", name, evalue); else { warnx("%s changes from %d to %d", name, sblock.fs_maxbpg, evalue); sblock.fs_maxbpg = evalue; } } if (fflag) { name = "average file size"; if (sblock.fs_avgfilesize == (unsigned)fvalue) { warnx("%s remains unchanged as %d", name, fvalue); } else { warnx("%s changes from %d to %d", name, sblock.fs_avgfilesize, fvalue); sblock.fs_avgfilesize = fvalue; } } if (jflag) { name = "soft updates journaling"; if (strcmp(jvalue, "enable") == 0) { if ((sblock.fs_flags & (FS_DOSOFTDEP | FS_SUJ)) == (FS_DOSOFTDEP | FS_SUJ)) { warnx("%s remains unchanged as enabled", name); } else if (sblock.fs_clean == 0) { warnx("%s cannot be enabled until fsck 
is run", name); } else if (journal_alloc(Svalue) != 0) { warnx("%s can not be enabled", name); } else { sblock.fs_flags |= FS_DOSOFTDEP | FS_SUJ; warnx("%s set", name); } } else if (strcmp(jvalue, "disable") == 0) { if ((~sblock.fs_flags & FS_SUJ) == FS_SUJ) { warnx("%s remains unchanged as disabled", name); } else { journal_clear(); sblock.fs_flags &= ~FS_SUJ; sblock.fs_sujfree = 0; warnx("%s cleared but soft updates still set.", name); warnx("remove .sujournal to reclaim space"); } } } if (Jflag) { name = "gjournal"; if (strcmp(Jvalue, "enable") == 0) { if (sblock.fs_flags & FS_GJOURNAL) { warnx("%s remains unchanged as enabled", name); } else { sblock.fs_flags |= FS_GJOURNAL; warnx("%s set", name); } } else if (strcmp(Jvalue, "disable") == 0) { if ((~sblock.fs_flags & FS_GJOURNAL) == FS_GJOURNAL) { warnx("%s remains unchanged as disabled", name); } else { sblock.fs_flags &= ~FS_GJOURNAL; warnx("%s cleared", name); } } } if (kflag) { name = "space to hold for metadata blocks"; if (sblock.fs_metaspace == kvalue) warnx("%s remains unchanged as %d", name, kvalue); else { kvalue = blknum(&sblock, kvalue); if (kvalue > sblock.fs_fpg / 2) { kvalue = blknum(&sblock, sblock.fs_fpg / 2); warnx("%s cannot exceed half the file system " "space", name); } warnx("%s changes from %jd to %d", name, sblock.fs_metaspace, kvalue); sblock.fs_metaspace = kvalue; } } if (lflag) { name = "multilabel"; if (strcmp(lvalue, "enable") == 0) { if (sblock.fs_flags & FS_MULTILABEL) { warnx("%s remains unchanged as enabled", name); } else { sblock.fs_flags |= FS_MULTILABEL; warnx("%s set", name); } } else if (strcmp(lvalue, "disable") == 0) { if ((~sblock.fs_flags & FS_MULTILABEL) == FS_MULTILABEL) { warnx("%s remains unchanged as disabled", name); } else { sblock.fs_flags &= ~FS_MULTILABEL; warnx("%s cleared", name); } } } if (mflag) { name = "minimum percentage of free space"; if (sblock.fs_minfree == mvalue) warnx("%s remains unchanged as %d%%", name, mvalue); else { warnx("%s changes from %d%% to %d%%", name, sblock.fs_minfree, mvalue); sblock.fs_minfree = mvalue; if (mvalue >= MINFREE && sblock.fs_optim == FS_OPTSPACE) warnx(OPTWARN, "time", ">=", MINFREE); if (mvalue < MINFREE && sblock.fs_optim == FS_OPTTIME) warnx(OPTWARN, "space", "<", MINFREE); } } if (Nflag) { name = "NFSv4 ACLs"; if (strcmp(Nvalue, "enable") == 0) { if (sblock.fs_flags & FS_NFS4ACLS) { warnx("%s remains unchanged as enabled", name); } else if (sblock.fs_flags & FS_ACLS) { warnx("%s and POSIX.1e ACLs are mutually " "exclusive", name); } else { sblock.fs_flags |= FS_NFS4ACLS; warnx("%s set", name); } } else if (strcmp(Nvalue, "disable") == 0) { if ((~sblock.fs_flags & FS_NFS4ACLS) == FS_NFS4ACLS) { warnx("%s remains unchanged as disabled", name); } else { sblock.fs_flags &= ~FS_NFS4ACLS; warnx("%s cleared", name); } } } if (nflag) { name = "soft updates"; if (strcmp(nvalue, "enable") == 0) { if (sblock.fs_flags & FS_DOSOFTDEP) warnx("%s remains unchanged as enabled", name); else if (sblock.fs_clean == 0) { warnx("%s cannot be enabled until fsck is run", name); } else { sblock.fs_flags |= FS_DOSOFTDEP; warnx("%s set", name); } } else if (strcmp(nvalue, "disable") == 0) { if ((~sblock.fs_flags & FS_DOSOFTDEP) == FS_DOSOFTDEP) warnx("%s remains unchanged as disabled", name); else { sblock.fs_flags &= ~FS_DOSOFTDEP; warnx("%s cleared", name); } } } if (oflag) { name = "optimization preference"; chg[FS_OPTSPACE] = "space"; chg[FS_OPTTIME] = "time"; if (sblock.fs_optim == ovalue) warnx("%s remains unchanged as %s", name, chg[ovalue]); else { 
warnx("%s changes from %s to %s", name, chg[sblock.fs_optim], chg[ovalue]); sblock.fs_optim = ovalue; if (sblock.fs_minfree >= MINFREE && ovalue == FS_OPTSPACE) warnx(OPTWARN, "time", ">=", MINFREE); if (sblock.fs_minfree < MINFREE && ovalue == FS_OPTTIME) warnx(OPTWARN, "space", "<", MINFREE); } } if (sflag) { name = "expected number of files per directory"; if (sblock.fs_avgfpdir == (unsigned)svalue) { warnx("%s remains unchanged as %d", name, svalue); } else { warnx("%s changes from %d to %d", name, sblock.fs_avgfpdir, svalue); sblock.fs_avgfpdir = svalue; } } if (tflag) { name = "issue TRIM to the disk"; if (strcmp(tvalue, "enable") == 0) { if (sblock.fs_flags & FS_TRIM) warnx("%s remains unchanged as enabled", name); else { sblock.fs_flags |= FS_TRIM; warnx("%s set", name); } } else if (strcmp(tvalue, "disable") == 0) { if ((~sblock.fs_flags & FS_TRIM) == FS_TRIM) warnx("%s remains unchanged as disabled", name); else { sblock.fs_flags &= ~FS_TRIM; warnx("%s cleared", name); } } } if (sbwrite(&disk, Aflag) == -1) goto err; ufs_disk_close(&disk); if (active) { build_iovec_argf(&iov, &iovlen, "fstype", "ufs"); build_iovec_argf(&iov, &iovlen, "fspath", "%s", on); build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); if (nmount(iov, iovlen, stfs.f_flags | MNT_UPDATE | MNT_RELOAD) < 0) { if (errmsg[0]) err(9, "%s: reload: %s", special, errmsg); else err(9, "%s: reload", special); } warnx("file system reloaded"); } exit(0); err: if (disk.d_error != NULL) errx(11, "%s: %s", special, disk.d_error); else err(12, "%s", special); } static void sbdirty(void) { disk.d_fs.fs_flags |= FS_UNCLEAN | FS_NEEDSFSCK; disk.d_fs.fs_clean = 0; } static ufs2_daddr_t journal_balloc(void) { ufs2_daddr_t blk; struct cg *cgp; int valid; static int contig = 1; cgp = &disk.d_cg; for (;;) { blk = cgballoc(&disk); if (blk > 0) break; /* * If we failed to allocate a block from this cg, move to * the next. */ if (cgwrite(&disk) < 0) { warn("Failed to write updated cg"); return (-1); } while ((valid = cgread(&disk)) == 1) { /* * Try to minimize fragmentation by requiring a minimum * number of blocks present. */ if (cgp->cg_cs.cs_nbfree > 256 * 1024) break; if (contig == 0 && cgp->cg_cs.cs_nbfree) break; } if (valid) continue; /* * Try once through looking only for large contiguous regions * and again taking any space we can find. */ if (contig) { contig = 0; disk.d_ccg = 0; warnx("Journal file fragmented."); continue; } warnx("Failed to find sufficient free blocks for the journal"); return -1; } if (bwrite(&disk, fsbtodb(&sblock, blk), clrbuf, sblock.fs_bsize) <= 0) { warn("Failed to initialize new block"); return -1; } return (blk); } /* * Search a directory block for the SUJ_FILE. */ static ino_t dir_search(ufs2_daddr_t blk, int bytes) { char block[MAXBSIZE]; struct direct *dp; int off; if (bread(&disk, fsbtodb(&sblock, blk), block, bytes) <= 0) { warn("Failed to read dir block"); return (-1); } for (off = 0; off < bytes; off += dp->d_reclen) { dp = (struct direct *)&block[off]; if (dp->d_reclen == 0) break; if (dp->d_ino == 0) continue; if (dp->d_namlen != strlen(SUJ_FILE)) continue; if (bcmp(dp->d_name, SUJ_FILE, dp->d_namlen) != 0) continue; return (dp->d_ino); } return (0); } /* - * Search in the ROOTINO for the SUJ_FILE. If it exists we can not enable + * Search in the UFS_ROOTINO for the SUJ_FILE. If it exists we can not enable * journaling. 
*/ static ino_t journal_findfile(void) { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; ino_t ino; int mode; void *ip; int i; - if (getino(&disk, &ip, ROOTINO, &mode) != 0) { + if (getino(&disk, &ip, UFS_ROOTINO, &mode) != 0) { warn("Failed to get root inode"); return (-1); } dp2 = ip; dp1 = ip; if (sblock.fs_magic == FS_UFS1_MAGIC) { - if ((off_t)dp1->di_size >= lblktosize(&sblock, NDADDR)) { - warnx("ROOTINO extends beyond direct blocks."); + if ((off_t)dp1->di_size >= lblktosize(&sblock, UFS_NDADDR)) { + warnx("UFS_ROOTINO extends beyond direct blocks."); return (-1); } - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < UFS_NDADDR; i++) { if (dp1->di_db[i] == 0) break; if ((ino = dir_search(dp1->di_db[i], sblksize(&sblock, (off_t)dp1->di_size, i))) != 0) return (ino); } } else { - if ((off_t)dp2->di_size >= lblktosize(&sblock, NDADDR)) { - warnx("ROOTINO extends beyond direct blocks."); + if ((off_t)dp2->di_size >= lblktosize(&sblock, UFS_NDADDR)) { + warnx("UFS_ROOTINO extends beyond direct blocks."); return (-1); } - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < UFS_NDADDR; i++) { if (dp2->di_db[i] == 0) break; if ((ino = dir_search(dp2->di_db[i], sblksize(&sblock, (off_t)dp2->di_size, i))) != 0) return (ino); } } return (0); } static void dir_clear_block(const char *block, off_t off) { struct direct *dp; for (; off < sblock.fs_bsize; off += DIRBLKSIZ) { dp = (struct direct *)&block[off]; dp->d_ino = 0; dp->d_reclen = DIRBLKSIZ; dp->d_type = DT_UNKNOWN; } } /* * Insert the journal at inode 'ino' into directory blk 'blk' at the first * free offset of 'off'. DIRBLKSIZ blocks after off are initialized as * empty. */ static int dir_insert(ufs2_daddr_t blk, off_t off, ino_t ino) { struct direct *dp; char block[MAXBSIZE]; if (bread(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) { warn("Failed to read dir block"); return (-1); } bzero(&block[off], sblock.fs_bsize - off); dp = (struct direct *)&block[off]; dp->d_ino = ino; dp->d_reclen = DIRBLKSIZ; dp->d_type = DT_REG; dp->d_namlen = strlen(SUJ_FILE); bcopy(SUJ_FILE, &dp->d_name, strlen(SUJ_FILE)); dir_clear_block(block, off + DIRBLKSIZ); if (bwrite(&disk, fsbtodb(&sblock, blk), block, sblock.fs_bsize) <= 0) { warn("Failed to write dir block"); return (-1); } return (0); } /* * Extend a directory block in 'blk' by copying it to a full size block * and inserting the new journal inode into .sujournal. */ static int dir_extend(ufs2_daddr_t blk, ufs2_daddr_t nblk, off_t size, ino_t ino) { char block[MAXBSIZE]; if (bread(&disk, fsbtodb(&sblock, blk), block, roundup(size, sblock.fs_fsize)) <= 0) { warn("Failed to read dir block"); return (-1); } dir_clear_block(block, size); if (bwrite(&disk, fsbtodb(&sblock, nblk), block, sblock.fs_bsize) <= 0) { warn("Failed to write dir block"); return (-1); } return (dir_insert(nblk, size, ino)); } /* - * Insert the journal file into the ROOTINO directory. We always extend the + * Insert the journal file into the UFS_ROOTINO directory. 
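journal_findfile() also illustrates the dual-format inode idiom used throughout this file: getino() hands back an opaque buffer, a ufs1 and a ufs2 pointer both alias it, and every field access is gated on fs_magic. A trimmed sketch of that shape; the struct stand-ins here carry only di_size, and the magic values are quoted from <ufs/ffs/fs.h>:

#include <stdint.h>

#define FS_UFS1_MAGIC	0x011954	/* values as in <ufs/ffs/fs.h> */
#define FS_UFS2_MAGIC	0x19540119

struct sdinode1 { uint64_t di_size; };	/* stand-ins; the real structs */
struct sdinode2 { uint64_t di_size; };	/* live in <ufs/ufs/dinode.h> */

/* One opaque inode buffer, two typed views, chosen by fs_magic. */
static uint64_t
inode_size(void *ip, int32_t fs_magic)
{
	struct sdinode1 *dp1 = ip;
	struct sdinode2 *dp2 = ip;

	return (fs_magic == FS_UFS1_MAGIC ? dp1->di_size : dp2->di_size);
}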
We always extend the * last frag */ static int journal_insertfile(ino_t ino) { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; void *ip; ufs2_daddr_t nblk; ufs2_daddr_t blk; ufs_lbn_t lbn; int size; int mode; int off; - if (getino(&disk, &ip, ROOTINO, &mode) != 0) { + if (getino(&disk, &ip, UFS_ROOTINO, &mode) != 0) { warn("Failed to get root inode"); sbdirty(); return (-1); } dp2 = ip; dp1 = ip; blk = 0; size = 0; nblk = journal_balloc(); if (nblk <= 0) return (-1); /* - * For simplicity sake we aways extend the ROOTINO into a new + * For simplicity's sake we always extend the UFS_ROOTINO into a new * directory block rather than searching for space and inserting * into an existing block. However, if the rootino has frags * we have to free them and extend the block. */ if (sblock.fs_magic == FS_UFS1_MAGIC) { lbn = lblkno(&sblock, dp1->di_size); off = blkoff(&sblock, dp1->di_size); blk = dp1->di_db[lbn]; size = sblksize(&sblock, (off_t)dp1->di_size, lbn); } else { lbn = lblkno(&sblock, dp2->di_size); off = blkoff(&sblock, dp2->di_size); blk = dp2->di_db[lbn]; size = sblksize(&sblock, (off_t)dp2->di_size, lbn); } if (off != 0) { if (dir_extend(blk, nblk, off, ino) == -1) return (-1); } else { blk = 0; if (dir_insert(nblk, 0, ino) == -1) return (-1); } if (sblock.fs_magic == FS_UFS1_MAGIC) { dp1->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE; dp1->di_db[lbn] = nblk; dp1->di_size = lblktosize(&sblock, lbn+1); } else { dp2->di_blocks += (sblock.fs_bsize - size) / DEV_BSIZE; dp2->di_db[lbn] = nblk; dp2->di_size = lblktosize(&sblock, lbn+1); } if (putino(&disk) < 0) { warn("Failed to write root inode"); return (-1); } if (cgwrite(&disk) < 0) { warn("Failed to write updated cg"); sbdirty(); return (-1); } if (blk) { if (cgbfree(&disk, blk, size) < 0) { warn("Failed to write cg"); return (-1); } } return (0); } static int indir_fill(ufs2_daddr_t blk, int level, int *resid) { char indirbuf[MAXBSIZE]; ufs1_daddr_t *bap1; ufs2_daddr_t *bap2; ufs2_daddr_t nblk; int ncnt; int cnt; int i; bzero(indirbuf, sizeof(indirbuf)); bap1 = (ufs1_daddr_t *)indirbuf; bap2 = (void *)bap1; cnt = 0; for (i = 0; i < NINDIR(&sblock) && *resid != 0; i++) { nblk = journal_balloc(); if (nblk <= 0) return (-1); cnt++; if (sblock.fs_magic == FS_UFS1_MAGIC) *bap1++ = nblk; else *bap2++ = nblk; if (level != 0) { ncnt = indir_fill(nblk, level - 1, resid); if (ncnt <= 0) return (-1); cnt += ncnt; } else (*resid)--; } if (bwrite(&disk, fsbtodb(&sblock, blk), indirbuf, sblock.fs_bsize) <= 0) { warn("Failed to write indirect"); return (-1); } return (cnt); } /* * Clear the flag bits so the journal can be removed. */ static void journal_clear(void) { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; ino_t ino; int mode; void *ip; ino = journal_findfile(); if (ino == (ino_t)-1 || ino == 0) { warnx("Journal file does not exist"); return; } printf("Clearing journal flags from inode %ju\n", (uintmax_t)ino); if (getino(&disk, &ip, ino, &mode) != 0) { warn("Failed to get journal inode"); return; } dp2 = ip; dp1 = ip; if (sblock.fs_magic == FS_UFS1_MAGIC) dp1->di_flags = 0; else dp2->di_flags = 0; if (putino(&disk) < 0) { warn("Failed to write journal inode"); return; } } static int journal_alloc(int64_t size) { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; ufs2_daddr_t blk; void *ip; struct cg *cgp; int resid; ino_t ino; int blks; int mode; time_t utime; int i; cgp = &disk.d_cg; ino = 0; /* * If the journal file exists we can't allocate it.
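indir_fill() above descends a pointer tree whose fan-out is NINDIR(fs), i.e. fs_bsize divided by the size of one block pointer, decrementing *resid once per data block written. The reachable file size follows directly from that geometry, as in this small standalone calculation (max_file_blocks() is an illustrative name):

#include <stdint.h>
#include <stdio.h>

/* Blocks addressable by one inode: ndaddr direct pointers plus
 * niaddr indirect trees of depth 1..niaddr, nindir pointers each. */
static uint64_t
max_file_blocks(uint32_t bsize, uint32_t ptrsize, int ndaddr, int niaddr)
{
	uint64_t total = ndaddr, leaves = 1;
	uint64_t nindir = bsize / ptrsize;
	int lvl;

	for (lvl = 0; lvl < niaddr; lvl++) {
		leaves *= nindir;	/* depth lvl+1 tree: nindir^(lvl+1) */
		total += leaves;
	}
	return (total);
}

int
main(void)
{
	/* UFS2-like geometry: 32k blocks, 8-byte pointers, 12+3 slots. */
	printf("%ju blocks\n", (uintmax_t)max_file_blocks(32768, 8, 12, 3));
	return (0);
}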
*/ ino = journal_findfile(); if (ino == (ino_t)-1) return (-1); if (ino > 0) { warnx("Journal file %s already exists, please remove.", SUJ_FILE); return (-1); } /* * If the user didn't supply a size pick one based on the filesystem * size constrained with hardcoded MIN and MAX values. We opt for * 1/1024th of the filesystem up to MAX but not exceeding one CG and * not less than the MIN. */ if (size == 0) { size = (sblock.fs_size * sblock.fs_bsize) / 1024; size = MIN(SUJ_MAX, size); if (size / sblock.fs_fsize > sblock.fs_fpg) size = sblock.fs_fpg * sblock.fs_fsize; size = MAX(SUJ_MIN, size); /* fsck does not support fragments in journal files. */ size = roundup(size, sblock.fs_bsize); } resid = blocks = size / sblock.fs_bsize; if (sblock.fs_cstotal.cs_nbfree < blocks) { warn("Insufficient free space for %jd byte journal", size); return (-1); } /* * Find a cg with enough blocks to satisfy the journal * size. Presently the journal does not span cgs. */ while (cgread(&disk) == 1) { if (cgp->cg_cs.cs_nifree == 0) continue; ino = cgialloc(&disk); if (ino <= 0) break; printf("Using inode %ju in cg %d for %jd byte journal\n", (uintmax_t)ino, cgp->cg_cgx, size); if (getino(&disk, &ip, ino, &mode) != 0) { warn("Failed to get allocated inode"); sbdirty(); goto out; } /* * We leave fields unrelated to the number of allocated * blocks and size uninitialized. This causes legacy * fsck implementations to clear the inode. */ dp2 = ip; dp1 = ip; time(&utime); if (sblock.fs_magic == FS_UFS1_MAGIC) { bzero(dp1, sizeof(*dp1)); dp1->di_size = size; dp1->di_mode = IFREG | IREAD; dp1->di_nlink = 1; dp1->di_flags = SF_IMMUTABLE | SF_NOUNLINK | UF_NODUMP; dp1->di_atime = utime; dp1->di_mtime = utime; dp1->di_ctime = utime; } else { bzero(dp2, sizeof(*dp2)); dp2->di_size = size; dp2->di_mode = IFREG | IREAD; dp2->di_nlink = 1; dp2->di_flags = SF_IMMUTABLE | SF_NOUNLINK | UF_NODUMP; dp2->di_atime = utime; dp2->di_mtime = utime; dp2->di_ctime = utime; dp2->di_birthtime = utime; } - for (i = 0; i < NDADDR && resid; i++, resid--) { + for (i = 0; i < UFS_NDADDR && resid; i++, resid--) { blk = journal_balloc(); if (blk <= 0) goto out; if (sblock.fs_magic == FS_UFS1_MAGIC) { dp1->di_db[i] = blk; dp1->di_blocks++; } else { dp2->di_db[i] = blk; dp2->di_blocks++; } } - for (i = 0; i < NIADDR && resid; i++) { + for (i = 0; i < UFS_NIADDR && resid; i++) { blk = journal_balloc(); if (blk <= 0) goto out; blks = indir_fill(blk, i, &resid) + 1; if (blks <= 0) { sbdirty(); goto out; } if (sblock.fs_magic == FS_UFS1_MAGIC) { dp1->di_ib[i] = blk; dp1->di_blocks += blks; } else { dp2->di_ib[i] = blk; dp2->di_blocks += blks; } } if (sblock.fs_magic == FS_UFS1_MAGIC) dp1->di_blocks *= sblock.fs_bsize / disk.d_bsize; else dp2->di_blocks *= sblock.fs_bsize / disk.d_bsize; if (putino(&disk) < 0) { warn("Failed to write inode"); sbdirty(); return (-1); } if (cgwrite(&disk) < 0) { warn("Failed to write updated cg"); sbdirty(); return (-1); } if (journal_insertfile(ino) < 0) { sbdirty(); return (-1); } sblock.fs_sujfree = 0; return (0); } warnx("Insufficient free space for the journal."); out: return (-1); } static void usage(void) { fprintf(stderr, "%s\n%s\n%s\n%s\n%s\n%s\n", "usage: tunefs [-A] [-a enable | disable] [-e maxbpg] [-f avgfilesize]", " [-J enable | disable] [-j enable | disable] [-k metaspace]", " [-L volname] [-l enable | disable] [-m minfree]", " [-N enable | disable] [-n enable | disable]", " [-o space | time] [-p] [-s avgfpdir] [-t enable | disable]", " special | filesystem"); exit(2); } static void printfs(void) { 
warnx("POSIX.1e ACLs: (-a) %s", (sblock.fs_flags & FS_ACLS)? "enabled" : "disabled"); warnx("NFSv4 ACLs: (-N) %s", (sblock.fs_flags & FS_NFS4ACLS)? "enabled" : "disabled"); warnx("MAC multilabel: (-l) %s", (sblock.fs_flags & FS_MULTILABEL)? "enabled" : "disabled"); warnx("soft updates: (-n) %s", (sblock.fs_flags & FS_DOSOFTDEP)? "enabled" : "disabled"); warnx("soft update journaling: (-j) %s", (sblock.fs_flags & FS_SUJ)? "enabled" : "disabled"); warnx("gjournal: (-J) %s", (sblock.fs_flags & FS_GJOURNAL)? "enabled" : "disabled"); warnx("trim: (-t) %s", (sblock.fs_flags & FS_TRIM)? "enabled" : "disabled"); warnx("maximum blocks per file in a cylinder group: (-e) %d", sblock.fs_maxbpg); warnx("average file size: (-f) %d", sblock.fs_avgfilesize); warnx("average number of files in a directory: (-s) %d", sblock.fs_avgfpdir); warnx("minimum percentage of free space: (-m) %d%%", sblock.fs_minfree); warnx("space to hold for metadata blocks: (-k) %jd", sblock.fs_metaspace); warnx("optimization preference: (-o) %s", sblock.fs_optim == FS_OPTSPACE ? "space" : "time"); if (sblock.fs_minfree >= MINFREE && sblock.fs_optim == FS_OPTSPACE) warnx(OPTWARN, "time", ">=", MINFREE); if (sblock.fs_minfree < MINFREE && sblock.fs_optim == FS_OPTTIME) warnx(OPTWARN, "space", "<", MINFREE); warnx("volume label: (-L) %s", sblock.fs_volname); } Index: head/sys/boot/common/ufsread.c =================================================================== --- head/sys/boot/common/ufsread.c (revision 313779) +++ head/sys/boot/common/ufsread.c (revision 313780) @@ -1,326 +1,326 @@ /*- * Copyright (c) 2002 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and McAfee Research,, the Security Research Division of * McAfee, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as * part of the DARPA CHATS research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1998 Robert Nordier * All rights reserved. * * Redistribution and use in source and binary forms are freely * permitted provided that the above copyright notice and this * paragraph and the following disclaimer are duplicated in all * such forms. 
* * This software is provided "AS IS" and without any express or * implied warranties, including, without limitation, the implied * warranties of merchantability and fitness for a particular * purpose. */ #include __FBSDID("$FreeBSD$"); #include #include #include #ifdef UFS_SMALL_CGBASE /* XXX: Revert to old (broken for over 1.5Tb filesystems) version of cgbase (see sys/ufs/ffs/fs.h rev 1.39) so that small boot loaders (e.g. boot2) can support both UFS1 and UFS2. */ #undef cgbase #define cgbase(fs, c) ((ufs2_daddr_t)((fs)->fs_fpg * (c))) #endif typedef uint32_t ufs_ino_t; /* * We use 4k `virtual' blocks for filesystem data, whatever the actual * filesystem block size. FFS blocks are always a multiple of 4k. */ #define VBLKSHIFT 12 #define VBLKSIZE (1 << VBLKSHIFT) #define VBLKMASK (VBLKSIZE - 1) #define DBPERVBLK (VBLKSIZE / DEV_BSIZE) #define INDIRPERVBLK(fs) (NINDIR(fs) / ((fs)->fs_bsize >> VBLKSHIFT)) #define IPERVBLK(fs) (INOPB(fs) / ((fs)->fs_bsize >> VBLKSHIFT)) #define INO_TO_VBA(fs, ipervblk, x) \ (fsbtodb(fs, cgimin(fs, ino_to_cg(fs, x))) + \ (((x) % (fs)->fs_ipg) / (ipervblk) * DBPERVBLK)) #define INO_TO_VBO(ipervblk, x) ((x) % ipervblk) #define FS_TO_VBA(fs, fsb, off) (fsbtodb(fs, fsb) + \ ((off) / VBLKSIZE) * DBPERVBLK) #define FS_TO_VBO(fs, fsb, off) ((off) & VBLKMASK) /* Buffers that must not span a 64k boundary. */ struct dmadat { char blkbuf[VBLKSIZE]; /* filesystem blocks */ char indbuf[VBLKSIZE]; /* indir blocks */ char sbbuf[SBLOCKSIZE]; /* superblock */ char secbuf[DEV_BSIZE]; /* for MBR/disklabel */ }; static struct dmadat *dmadat; static ufs_ino_t lookup(const char *); static ssize_t fsread(ufs_ino_t, void *, size_t); static uint8_t ls, dsk_meta; static uint32_t fs_off; static __inline uint8_t fsfind(const char *name, ufs_ino_t * ino) { static char buf[DEV_BSIZE]; static struct direct d; char *s; ssize_t n; fs_off = 0; while ((n = fsread(*ino, buf, DEV_BSIZE)) > 0) for (s = buf; s < buf + DEV_BSIZE;) { memcpy(&d, s, sizeof(struct direct)); if (ls) printf("%s ", d.d_name); else if (!strcmp(name, d.d_name)) { *ino = d.d_ino; return d.d_type; } s += d.d_reclen; } if (n != -1 && ls) printf("\n"); return 0; } static ufs_ino_t lookup(const char *path) { static char name[MAXNAMLEN + 1]; const char *s; ufs_ino_t ino; ssize_t n; uint8_t dt; - ino = ROOTINO; + ino = UFS_ROOTINO; dt = DT_DIR; for (;;) { if (*path == '/') path++; if (!*path) break; for (s = path; *s && *s != '/'; s++); if ((n = s - path) > MAXNAMLEN) return 0; ls = *path == '?' && n == 1 && !*s; memcpy(name, path, n); name[n] = 0; if (dt != DT_DIR) { printf("%s: not a directory.\n", name); return (0); } if ((dt = fsfind(name, &ino)) <= 0) break; path = s; } return dt == DT_REG ? ino : 0; } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; #if defined(UFS2_ONLY) #define DIP(field) dp2.field #elif defined(UFS1_ONLY) #define DIP(field) dp1.field #else #define DIP(field) fs.fs_magic == FS_UFS1_MAGIC ? dp1.field : dp2.field #endif static ssize_t fsread_size(ufs_ino_t inode, void *buf, size_t nbyte, size_t *fsizep) { #ifndef UFS2_ONLY static struct ufs1_dinode dp1; ufs1_daddr_t addr1; #endif #ifndef UFS1_ONLY static struct ufs2_dinode dp2; #endif static struct fs fs; static ufs_ino_t inomap; char *blkbuf; void *indbuf; char *s; size_t n, nb, size, off, vboff; ufs_lbn_t lbn; ufs2_daddr_t addr2, vbaddr; static ufs2_daddr_t blkmap, indmap; u_int u; /* Basic parameter validation. 
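lookup() above tokenizes the path by hand rather than with strtok(): skip leading slashes, scan to the next slash, bound the component length, then search the current directory for that name. The tokenizer on its own, as a runnable sketch; walk_path() is invented, and the printf() stands in for the fsfind() call:

#include <stdio.h>
#include <string.h>

#define NAMEMAX	255		/* stands in for MAXNAMLEN */

static int
walk_path(const char *path)
{
	char name[NAMEMAX + 1];
	const char *s;
	size_t n;

	for (;;) {
		while (*path == '/')		/* collapse slashes */
			path++;
		if (*path == '\0')
			break;
		for (s = path; *s != '\0' && *s != '/'; s++)
			;
		if ((n = (size_t)(s - path)) > NAMEMAX)
			return (-1);		/* component too long */
		memcpy(name, path, n);
		name[n] = '\0';
		printf("component: %s\n", name);	/* fsfind() here */
		path = s;
	}
	return (0);
}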
*/ if ((buf == NULL && nbyte != 0) || dmadat == NULL) return (-1); blkbuf = dmadat->blkbuf; indbuf = dmadat->indbuf; /* * Force probe if inode is zero to ensure we have a valid fs, otherwise * when probing multiple paritions, reads from subsequent parititions * will incorrectly succeed. */ if (!dsk_meta || inode == 0) { inomap = 0; dsk_meta = 0; for (n = 0; sblock_try[n] != -1; n++) { if (dskread(dmadat->sbbuf, sblock_try[n] / DEV_BSIZE, SBLOCKSIZE / DEV_BSIZE)) return -1; memcpy(&fs, dmadat->sbbuf, sizeof(struct fs)); if (( #if defined(UFS1_ONLY) fs.fs_magic == FS_UFS1_MAGIC #elif defined(UFS2_ONLY) (fs.fs_magic == FS_UFS2_MAGIC && fs.fs_sblockloc == sblock_try[n]) #else fs.fs_magic == FS_UFS1_MAGIC || (fs.fs_magic == FS_UFS2_MAGIC && fs.fs_sblockloc == sblock_try[n]) #endif ) && fs.fs_bsize <= MAXBSIZE && fs.fs_bsize >= (int32_t)sizeof(struct fs)) break; } if (sblock_try[n] == -1) { return -1; } dsk_meta++; } else memcpy(&fs, dmadat->sbbuf, sizeof(struct fs)); if (!inode) return 0; if (inomap != inode) { n = IPERVBLK(&fs); if (dskread(blkbuf, INO_TO_VBA(&fs, n, inode), DBPERVBLK)) return -1; n = INO_TO_VBO(n, inode); #if defined(UFS1_ONLY) memcpy(&dp1, (struct ufs1_dinode *)(void *)blkbuf + n, sizeof(dp1)); #elif defined(UFS2_ONLY) memcpy(&dp2, (struct ufs2_dinode *)(void *)blkbuf + n, sizeof(dp2)); #else if (fs.fs_magic == FS_UFS1_MAGIC) memcpy(&dp1, (struct ufs1_dinode *)(void *)blkbuf + n, sizeof(dp1)); else memcpy(&dp2, (struct ufs2_dinode *)(void *)blkbuf + n, sizeof(dp2)); #endif inomap = inode; fs_off = 0; blkmap = indmap = 0; } s = buf; size = DIP(di_size); n = size - fs_off; if (nbyte > n) nbyte = n; nb = nbyte; while (nb) { lbn = lblkno(&fs, fs_off); off = blkoff(&fs, fs_off); - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { addr2 = DIP(di_db[lbn]); - } else if (lbn < NDADDR + NINDIR(&fs)) { + } else if (lbn < UFS_NDADDR + NINDIR(&fs)) { n = INDIRPERVBLK(&fs); addr2 = DIP(di_ib[0]); - u = (u_int)(lbn - NDADDR) / n * DBPERVBLK; + u = (u_int)(lbn - UFS_NDADDR) / n * DBPERVBLK; vbaddr = fsbtodb(&fs, addr2) + u; if (indmap != vbaddr) { if (dskread(indbuf, vbaddr, DBPERVBLK)) return -1; indmap = vbaddr; } - n = (lbn - NDADDR) & (n - 1); + n = (lbn - UFS_NDADDR) & (n - 1); #if defined(UFS1_ONLY) memcpy(&addr1, (ufs1_daddr_t *)indbuf + n, sizeof(ufs1_daddr_t)); addr2 = addr1; #elif defined(UFS2_ONLY) memcpy(&addr2, (ufs2_daddr_t *)indbuf + n, sizeof(ufs2_daddr_t)); #else if (fs.fs_magic == FS_UFS1_MAGIC) { memcpy(&addr1, (ufs1_daddr_t *)indbuf + n, sizeof(ufs1_daddr_t)); addr2 = addr1; } else memcpy(&addr2, (ufs2_daddr_t *)indbuf + n, sizeof(ufs2_daddr_t)); #endif } else return -1; vbaddr = fsbtodb(&fs, addr2) + (off >> VBLKSHIFT) * DBPERVBLK; vboff = off & VBLKMASK; n = sblksize(&fs, (off_t)size, lbn) - (off & ~VBLKMASK); if (n > VBLKSIZE) n = VBLKSIZE; if (blkmap != vbaddr) { if (dskread(blkbuf, vbaddr, n >> DEV_BSHIFT)) return -1; blkmap = vbaddr; } n -= vboff; if (n > nb) n = nb; memcpy(s, blkbuf + vboff, n); s += n; fs_off += n; nb -= n; } if (fsizep != NULL) *fsizep = size; return nbyte; } static ssize_t fsread(ufs_ino_t inode, void *buf, size_t nbyte) { return fsread_size(inode, buf, nbyte, NULL); } Index: head/sys/fs/ext2fs/ext2_alloc.c =================================================================== --- head/sys/fs/ext2fs/ext2_alloc.c (revision 313779) +++ head/sys/fs/ext2fs/ext2_alloc.c (revision 313780) @@ -1,1118 +1,1118 @@ /*- * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright 
(c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.8 (Berkeley) 2/21/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include static daddr_t ext2_alloccg(struct inode *, int, daddr_t, int); static daddr_t ext2_clusteralloc(struct inode *, int, daddr_t, int); static u_long ext2_dirpref(struct inode *); static void ext2_fserr(struct m_ext2fs *, uid_t, char *); static u_long ext2_hashalloc(struct inode *, int, long, int, daddr_t (*)(struct inode *, int, daddr_t, int)); static daddr_t ext2_nodealloccg(struct inode *, int, daddr_t, int); static daddr_t ext2_mapsearch(struct m_ext2fs *, char *, daddr_t); /* * Allocate a block in the filesystem. * * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate a block in the same cylinder group. * 4) quadradically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following hierarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadradically rehash into other cylinder groups, until an * available block is located. 
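The three-step policy described above, preferred cylinder group first, then quadratic rehash, then brute-force sweep, is implemented later in this file by ext2_hashalloc(). Its probe order in isolation, as a sketch; probe_groups() and the try callback are names used only here:

/* Probe cylinder groups in the overflow order described above. */
static int
probe_groups(int ncg, int pref, int (*try)(int cg))
{
	int cg, i;

	if (try(pref))				/* 1: preferred group */
		return (pref);
	for (cg = pref, i = 1; i < ncg; i *= 2) { /* 2: quadratic rehash */
		cg += i;
		if (cg >= ncg)
			cg -= ncg;
		if (try(cg))
			return (cg);
	}
	cg = (pref + 2) % ncg;			/* 3: brute-force sweep */
	for (i = 2; i < ncg; i++) {
		if (try(cg))
			return (cg);
		if (++cg == ncg)
			cg = 0;
	}
	return (-1);				/* nothing available */
}

The brute-force pass starts at pref + 2 for the same reason the real code does: offset 0 was the initial probe and offset 1 is always covered by the first rehash step.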
*/ int ext2_alloc(struct inode *ip, daddr_t lbn, e4fs_daddr_t bpref, int size, struct ucred *cred, e4fs_daddr_t *bnp) { struct m_ext2fs *fs; struct ext2mount *ump; int32_t bno; int cg; *bnp = 0; fs = ip->i_e2fs; ump = ip->i_ump; mtx_assert(EXT2_MTX(ump), MA_OWNED); #ifdef INVARIANTS if ((u_int)size > fs->e2fs_bsize || blkoff(fs, size) != 0) { vn_printf(ip->i_devvp, "bsize = %lu, size = %d, fs = %s\n", (long unsigned int)fs->e2fs_bsize, size, fs->e2fs_fsmnt); panic("ext2_alloc: bad size"); } if (cred == NOCRED) panic("ext2_alloc: missing credential"); #endif /* INVARIANTS */ if (size == fs->e2fs_bsize && fs->e2fs->e2fs_fbcount == 0) goto nospace; if (cred->cr_uid != 0 && fs->e2fs->e2fs_fbcount < fs->e2fs->e2fs_rbcount) goto nospace; if (bpref >= fs->e2fs->e2fs_bcount) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = (daddr_t)ext2_hashalloc(ip, cg, bpref, fs->e2fs_bsize, ext2_alloccg); if (bno > 0) { /* set next_alloc fields as done in block_getblk */ ip->i_next_alloc_block = lbn; ip->i_next_alloc_goal = bno; ip->i_blocks += btodb(fs->e2fs_bsize); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bnp = bno; return (0); } nospace: EXT2_UNLOCK(ump); ext2_fserr(fs, cred->cr_uid, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->e2fs_fsmnt); return (ENOSPC); } /* * Reallocate a sequence of blocks into a contiguous sequence of blocks. * * The vnode and an array of buffer pointers for a range of sequential * logical blocks to be made contiguous is given. The allocator attempts * to find a range of sequential blocks starting as close as possible to * an fs_rotdelay offset from the end of the allocation for the logical * block immediately preceding the current range. If successful, the * physical block numbers in the buffer pointers and in the inode are * changed to reflect the new allocation. If unsuccessful, the allocation * is left unchanged. The success in doing the reallocation is returned. * Note that the error return is not reflected back to the user. Rather * the previous block allocation will be used. */ static SYSCTL_NODE(_vfs, OID_AUTO, ext2fs, CTLFLAG_RW, 0, "EXT2FS filesystem"); static int doasyncfree = 1; SYSCTL_INT(_vfs_ext2fs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, "Use asychronous writes to update block pointers when freeing blocks"); static int doreallocblks = 1; SYSCTL_INT(_vfs_ext2fs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); int ext2_reallocblks(struct vop_reallocblks_args *ap) { struct m_ext2fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp; uint32_t *bap, *sbap, *ebap; struct ext2mount *ump; struct cluster_save *buflist; - struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; + struct indir start_ap[EXT2_NIADDR + 1], end_ap[EXT2_NIADDR + 1], *idp; e2fs_lbn_t start_lbn, end_lbn; int soff; e2fs_daddr_t newblk, blkno; int i, len, start_lvl, end_lvl, pref, ssize; if (doreallocblks == 0) return (ENOSPC); vp = ap->a_vp; ip = VTOI(vp); fs = ip->i_e2fs; ump = ip->i_ump; if (fs->e2fs_contigsumsize <= 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ext2_reallocblks: non-cluster"); #endif /* * If the cluster crosses the boundary for the first indirect * block, leave space for the indirect block. 
Indirect blocks * are initially laid out in a position after the last direct * block. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ - if (start_lbn < NDADDR && end_lbn >= NDADDR) + if (start_lbn < EXT2_NDADDR && end_lbn >= EXT2_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ext2_getlbns(vp, start_lbn, start_ap, &start_lvl) || ext2_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->e2fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (u_int *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ext2_reallocblks: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->e2fs_bsize, NOCRED, &ebp)) goto fail; ebap = (u_int *)ebp->b_data; } /* * Find the preferred location for the cluster. */ EXT2_LOCK(ump); pref = ext2_blkpref(ip, start_lbn, soff, sbap, 0); /* * Search the block map looking for an allocation of the desired size. */ if ((newblk = (e2fs_daddr_t)ext2_hashalloc(ip, dtog(fs, pref), pref, len, ext2_clusteralloc)) == 0) { EXT2_UNLOCK(ump); goto fail; } /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DEBUG printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif /* DEBUG */ blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->e2fs_fpb) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (buflist->bs_children[i]->b_blkno != fsbtodb(fs, *bap)) panic("ext2_reallocblks: alloc mismatch"); #endif #ifdef DEBUG printf(" %d,", *bap); #endif /* DEBUG */ *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. 
*/ if (sbap != &ip->i_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (!doasyncfree) ext2_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DEBUG printf("\n\tnew:"); #endif /* DEBUG */ for (blkno = newblk, i = 0; i < len; i++, blkno += fs->e2fs_fpb) { ext2_blkfree(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->e2fs_bsize); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef DEBUG printf(" %d,", blkno); #endif /* DEBUG */ } #ifdef DEBUG printf("\n"); #endif /* DEBUG */ return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_db[0]) brelse(sbp); return (ENOSPC); } /* * Allocate an inode in the filesystem. * */ int ext2_valloc(struct vnode *pvp, int mode, struct ucred *cred, struct vnode **vpp) { struct timespec ts; struct inode *pip; struct m_ext2fs *fs; struct inode *ip; struct ext2mount *ump; ino_t ino, ipref; int i, error, cg; *vpp = NULL; pip = VTOI(pvp); fs = pip->i_e2fs; ump = pip->i_ump; EXT2_LOCK(ump); if (fs->e2fs->e2fs_ficount == 0) goto noinodes; /* * If it is a directory then obtain a cylinder group based on * ext2_dirpref else obtain it using ino_to_cg. The preferred inode is * always the next inode. */ if ((mode & IFMT) == IFDIR) { cg = ext2_dirpref(pip); if (fs->e2fs_contigdirs[cg] < 255) fs->e2fs_contigdirs[cg]++; } else { cg = ino_to_cg(fs, pip->i_number); if (fs->e2fs_contigdirs[cg] > 0) fs->e2fs_contigdirs[cg]--; } ipref = cg * fs->e2fs->e2fs_ipg + 1; ino = (ino_t)ext2_hashalloc(pip, cg, (long)ipref, mode, ext2_nodealloccg); if (ino == 0) goto noinodes; error = VFS_VGET(pvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { ext2_vfree(pvp, ino, mode); return (error); } ip = VTOI(*vpp); /* * The question is whether using VGET was such good idea at all: * Linux doesn't read the old inode in when it is allocating a * new one. I will set at least i_size and i_blocks to zero. */ ip->i_flag = 0; ip->i_size = 0; ip->i_blocks = 0; ip->i_mode = 0; ip->i_flags = 0; /* now we want to make sure that the block pointers are zeroed out */ - for (i = 0; i < NDADDR; i++) + for (i = 0; i < EXT2_NDADDR; i++) ip->i_db[i] = 0; - for (i = 0; i < NIADDR; i++) + for (i = 0; i < EXT2_NIADDR; i++) ip->i_ib[i] = 0; /* * Set up a new generation number for this inode. * Avoid zero values. */ do { ip->i_gen = arc4random(); } while (ip->i_gen == 0); vfs_timestamp(&ts); ip->i_birthtime = ts.tv_sec; ip->i_birthnsec = ts.tv_nsec; /* printf("ext2_valloc: allocated inode %d\n", ino); */ return (0); noinodes: EXT2_UNLOCK(ump); ext2_fserr(fs, cred->cr_uid, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->e2fs_fsmnt); return (ENOSPC); } /* * Find a cylinder to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. 
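ext2_dirpref() below encodes that placement policy as threshold scans over the group descriptors. A compressed sketch of the core selection, assuming invented gdesc field names; it folds the function's two linear passes into one modular pass and omits the maxcontigdirs throttle and the low-space backstop passes:

#include <stdint.h>

struct gdesc {			/* trimmed group descriptor */
	uint32_t ndirs, nifree, nbfree;
};

/* Pick a group for a new directory: starting at the parent's group,
 * take the first one under the directory ceiling that still clears
 * the free-inode and free-block floors, wrapping once around. */
static int
pick_dir_group(const struct gdesc *gd, int ngroups, int prefcg,
    uint32_t maxndir, uint32_t minifree, uint32_t minbfree)
{
	int cg, i;

	for (i = 0; i < ngroups; i++) {
		cg = (prefcg + i) % ngroups;
		if (gd[cg].ndirs < maxndir && gd[cg].nifree >= minifree &&
		    gd[cg].nbfree >= minbfree)
			return (cg);
	}
	return (prefcg);	/* backstop: fall back to the parent's group */
}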
* */ static u_long ext2_dirpref(struct inode *pip) { struct m_ext2fs *fs; int cg, prefcg, cgsize; u_int avgifree, avgbfree, avgndir, curdirsize; u_int minifree, minbfree, maxndir; u_int mincg, minndir; u_int dirsize, maxcontigdirs; mtx_assert(EXT2_MTX(pip->i_ump), MA_OWNED); fs = pip->i_e2fs; avgifree = fs->e2fs->e2fs_ficount / fs->e2fs_gcount; avgbfree = fs->e2fs->e2fs_fbcount / fs->e2fs_gcount; avgndir = fs->e2fs_total_dir / fs->e2fs_gcount; /* * Force allocation in another cg if creating a first level dir. */ ASSERT_VOP_LOCKED(ITOV(pip), "ext2fs_dirpref"); if (ITOV(pip)->v_vflag & VV_ROOT) { prefcg = arc4random() % fs->e2fs_gcount; mincg = prefcg; minndir = fs->e2fs_ipg; for (cg = prefcg; cg < fs->e2fs_gcount; cg++) if (fs->e2fs_gd[cg].ext2bgd_ndirs < minndir && fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree && fs->e2fs_gd[cg].ext2bgd_nbfree >= avgbfree) { mincg = cg; minndir = fs->e2fs_gd[cg].ext2bgd_ndirs; } for (cg = 0; cg < prefcg; cg++) if (fs->e2fs_gd[cg].ext2bgd_ndirs < minndir && fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree && fs->e2fs_gd[cg].ext2bgd_nbfree >= avgbfree) { mincg = cg; minndir = fs->e2fs_gd[cg].ext2bgd_ndirs; } return (mincg); } /* * Count various limits which used for * optimal allocation of a directory inode. */ maxndir = min(avgndir + fs->e2fs_ipg / 16, fs->e2fs_ipg); minifree = avgifree - avgifree / 4; if (minifree < 1) minifree = 1; minbfree = avgbfree - avgbfree / 4; if (minbfree < 1) minbfree = 1; cgsize = fs->e2fs_fsize * fs->e2fs_fpg; dirsize = AVGDIRSIZE; curdirsize = avgndir ? (cgsize - avgbfree * fs->e2fs_bsize) / avgndir : 0; if (dirsize < curdirsize) dirsize = curdirsize; maxcontigdirs = min((avgbfree * fs->e2fs_bsize) / dirsize, 255); maxcontigdirs = min(maxcontigdirs, fs->e2fs_ipg / AFPDIR); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. */ prefcg = ino_to_cg(fs, pip->i_number); for (cg = prefcg; cg < fs->e2fs_gcount; cg++) if (fs->e2fs_gd[cg].ext2bgd_ndirs < maxndir && fs->e2fs_gd[cg].ext2bgd_nifree >= minifree && fs->e2fs_gd[cg].ext2bgd_nbfree >= minbfree) { if (fs->e2fs_contigdirs[cg] < maxcontigdirs) return (cg); } for (cg = 0; cg < prefcg; cg++) if (fs->e2fs_gd[cg].ext2bgd_ndirs < maxndir && fs->e2fs_gd[cg].ext2bgd_nifree >= minifree && fs->e2fs_gd[cg].ext2bgd_nbfree >= minbfree) { if (fs->e2fs_contigdirs[cg] < maxcontigdirs) return (cg); } /* * This is a backstop when we have deficit in space. */ for (cg = prefcg; cg < fs->e2fs_gcount; cg++) if (fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree) return (cg); for (cg = 0; cg < prefcg; cg++) if (fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree) break; return (cg); } /* * Select the desired position for the next block in a file. * * we try to mimic what Remy does in inode_getblk/block_getblk * * we note: blocknr == 0 means that we're about to allocate either * a direct block or a pointer block at the first level of indirection * (In other words, stuff that will go in i_db[] or i_ib[]) * * blocknr != 0 means that we're allocating a block that is none * of the above. Then, blocknr tells us the number of the block * that will hold the pointer */ e4fs_daddr_t ext2_blkpref(struct inode *ip, e2fs_lbn_t lbn, int indx, e2fs_daddr_t *bap, e2fs_daddr_t blocknr) { int tmp; mtx_assert(EXT2_MTX(ip->i_ump), MA_OWNED); /* * If the next block is actually what we thought it is, then set the * goal to what we thought it should be. 
*/ if (ip->i_next_alloc_block == lbn && ip->i_next_alloc_goal != 0) return ip->i_next_alloc_goal; /* * Now check whether we were provided with an array that basically * tells us previous blocks to which we want to stay close. */ if (bap) for (tmp = indx - 1; tmp >= 0; tmp--) if (bap[tmp]) return bap[tmp]; /* * Else lets fall back to the blocknr or, if there is none, follow * the rule that a block should be allocated near its inode. */ return blocknr ? blocknr : (e2fs_daddr_t)(ip->i_block_group * EXT2_BLOCKS_PER_GROUP(ip->i_e2fs)) + ip->i_e2fs->e2fs->e2fs_first_dblock; } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadradically rehash on the cylinder group number. * 3) brute force search for a free block. */ static u_long ext2_hashalloc(struct inode *ip, int cg, long pref, int size, daddr_t (*allocator) (struct inode *, int, daddr_t, int)) { struct m_ext2fs *fs; ino_t result; int i, icg = cg; mtx_assert(EXT2_MTX(ip->i_ump), MA_OWNED); fs = ip->i_e2fs; /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->e2fs_gcount; i *= 2) { cg += i; if (cg >= fs->e2fs_gcount) cg -= fs->e2fs_gcount; result = (*allocator)(ip, cg, 0, size); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->e2fs_gcount; for (i = 2; i < fs->e2fs_gcount; i++) { result = (*allocator)(ip, cg, 0, size); if (result) return (result); cg++; if (cg == fs->e2fs_gcount) cg = 0; } return (0); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. */ static daddr_t ext2_alloccg(struct inode *ip, int cg, daddr_t bpref, int size) { struct m_ext2fs *fs; struct buf *bp; struct ext2mount *ump; daddr_t bno, runstart, runlen; int bit, loc, end, error, start; char *bbp; /* XXX ondisk32 */ fs = ip->i_e2fs; ump = ip->i_ump; if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0) return (0); EXT2_UNLOCK(ump); error = bread(ip->i_devvp, fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { brelse(bp); EXT2_LOCK(ump); return (0); } if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0) { /* * Another thread allocated the last block in this * group while we were waiting for the buffer. */ brelse(bp); EXT2_LOCK(ump); return (0); } bbp = (char *)bp->b_data; if (dtog(fs, bpref) != cg) bpref = 0; if (bpref != 0) { bpref = dtogd(fs, bpref); /* * if the requested block is available, use it */ if (isclr(bbp, bpref)) { bno = bpref; goto gotit; } } /* * no blocks in the requested cylinder, so take next * available one in this cylinder group. * first try to get 8 contigous blocks, then fall back to a single * block. */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = 0; end = howmany(fs->e2fs->e2fs_fpg, NBBY) - start; retry: runlen = 0; runstart = 0; for (loc = start; loc < end; loc++) { if (bbp[loc] == (char)0xff) { runlen = 0; continue; } /* Start of a run, find the number of high clear bits. */ if (runlen == 0) { bit = fls(bbp[loc]); runlen = NBBY - bit; runstart = loc * NBBY + bit; } else if (bbp[loc] == 0) { /* Continue a run. */ runlen += NBBY; } else { /* * Finish the current run. If it isn't long * enough, start a new one. 
*/ bit = ffs(bbp[loc]) - 1; runlen += bit; if (runlen >= 8) { bno = runstart; goto gotit; } /* Run was too short, start a new one. */ bit = fls(bbp[loc]); runlen = NBBY - bit; runstart = loc * NBBY + bit; } /* If the current run is long enough, use it. */ if (runlen >= 8) { bno = runstart; goto gotit; } } if (start != 0) { end = start; start = 0; goto retry; } bno = ext2_mapsearch(fs, bbp, bpref); if (bno < 0) { brelse(bp); EXT2_LOCK(ump); return (0); } gotit: #ifdef INVARIANTS if (isset(bbp, bno)) { printf("ext2fs_alloccgblk: cg=%d bno=%jd fs=%s\n", cg, (intmax_t)bno, fs->e2fs_fsmnt); panic("ext2fs_alloccg: dup alloc"); } #endif setbit(bbp, bno); EXT2_LOCK(ump); ext2_clusteracct(fs, bbp, cg, bno, -1); fs->e2fs->e2fs_fbcount--; fs->e2fs_gd[cg].ext2bgd_nbfree--; fs->e2fs_fmod = 1; EXT2_UNLOCK(ump); bdwrite(bp); return (cg * fs->e2fs->e2fs_fpg + fs->e2fs->e2fs_first_dblock + bno); } /* * Determine whether a cluster can be allocated. */ static daddr_t ext2_clusteralloc(struct inode *ip, int cg, daddr_t bpref, int len) { struct m_ext2fs *fs; struct ext2mount *ump; struct buf *bp; char *bbp; int bit, error, got, i, loc, run; int32_t *lp; daddr_t bno; fs = ip->i_e2fs; ump = ip->i_ump; if (fs->e2fs_maxcluster[cg] < len) return (0); EXT2_UNLOCK(ump); error = bread(ip->i_devvp, fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) goto fail_lock; bbp = (char *)bp->b_data; EXT2_LOCK(ump); /* * Check to see if a cluster of the needed size (or bigger) is * available in this cylinder group. */ lp = &fs->e2fs_clustersum[cg].cs_sum[len]; for (i = len; i <= fs->e2fs_contigsumsize; i++) if (*lp++ > 0) break; if (i > fs->e2fs_contigsumsize) { /* * Update the cluster summary information to reflect * the true maximum-sized cluster so that future cluster * allocation requests can avoid reading the bitmap only * to find no cluster. */ lp = &fs->e2fs_clustersum[cg].cs_sum[len - 1]; for (i = len - 1; i > 0; i--) if (*lp-- > 0) break; fs->e2fs_maxcluster[cg] = i; goto fail; } EXT2_UNLOCK(ump); /* Search the bitmap to find a big enough cluster like in FFS. */ if (dtog(fs, bpref) != cg) bpref = 0; if (bpref != 0) bpref = dtogd(fs, bpref); loc = bpref / NBBY; bit = 1 << (bpref % NBBY); for (run = 0, got = bpref; got < fs->e2fs->e2fs_fpg; got++) { if ((bbp[loc] & bit) != 0) run = 0; else { run++; if (run == len) break; } if ((got & (NBBY - 1)) != (NBBY - 1)) bit <<= 1; else { loc++; bit = 1; } } if (got >= fs->e2fs->e2fs_fpg) goto fail_lock; /* Allocate the cluster that we found. */ for (i = 1; i < len; i++) if (!isclr(bbp, got - run + i)) panic("ext2_clusteralloc: map mismatch"); bno = got - run + 1; if (bno >= fs->e2fs->e2fs_fpg) panic("ext2_clusteralloc: allocated out of group"); EXT2_LOCK(ump); for (i = 0; i < len; i += fs->e2fs_fpb) { setbit(bbp, bno + i); ext2_clusteracct(fs, bbp, cg, bno + i, -1); fs->e2fs->e2fs_fbcount--; fs->e2fs_gd[cg].ext2bgd_nbfree--; } fs->e2fs_fmod = 1; EXT2_UNLOCK(ump); bdwrite(bp); return (cg * fs->e2fs->e2fs_fpg + fs->e2fs->e2fs_first_dblock + bno); fail_lock: EXT2_LOCK(ump); fail: brelse(bp); return (0); } /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using tode in the specified cylinder group. 
*/ static daddr_t ext2_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode) { struct m_ext2fs *fs; struct buf *bp; struct ext2mount *ump; int error, start, len; char *ibp, *loc; ipref--; /* to avoid a lot of (ipref -1) */ if (ipref == -1) ipref = 0; fs = ip->i_e2fs; ump = ip->i_ump; if (fs->e2fs_gd[cg].ext2bgd_nifree == 0) return (0); EXT2_UNLOCK(ump); error = bread(ip->i_devvp, fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_i_bitmap), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { brelse(bp); EXT2_LOCK(ump); return (0); } if (fs->e2fs_gd[cg].ext2bgd_nifree == 0) { /* * Another thread allocated the last i-node in this * group while we were waiting for the buffer. */ brelse(bp); EXT2_LOCK(ump); return (0); } ibp = (char *)bp->b_data; if (ipref) { ipref %= fs->e2fs->e2fs_ipg; if (isclr(ibp, ipref)) goto gotit; } start = ipref / NBBY; len = howmany(fs->e2fs->e2fs_ipg - ipref, NBBY); loc = memcchr(&ibp[start], 0xff, len); if (loc == NULL) { len = start + 1; start = 0; loc = memcchr(&ibp[start], 0xff, len); if (loc == NULL) { printf("cg = %d, ipref = %lld, fs = %s\n", cg, (long long)ipref, fs->e2fs_fsmnt); panic("ext2fs_nodealloccg: map corrupted"); /* NOTREACHED */ } } ipref = (loc - ibp) * NBBY + ffs(~*loc) - 1; gotit: setbit(ibp, ipref); EXT2_LOCK(ump); fs->e2fs_gd[cg].ext2bgd_nifree--; fs->e2fs->e2fs_ficount--; fs->e2fs_fmod = 1; if ((mode & IFMT) == IFDIR) { fs->e2fs_gd[cg].ext2bgd_ndirs++; fs->e2fs_total_dir++; } EXT2_UNLOCK(ump); bdwrite(bp); return (cg * fs->e2fs->e2fs_ipg + ipref + 1); } /* * Free a block or fragment. * */ void ext2_blkfree(struct inode *ip, e4fs_daddr_t bno, long size) { struct m_ext2fs *fs; struct buf *bp; struct ext2mount *ump; int cg, error; char *bbp; fs = ip->i_e2fs; ump = ip->i_ump; cg = dtog(fs, bno); if ((u_int)bno >= fs->e2fs->e2fs_bcount) { printf("bad block %lld, ino %ju\n", (long long)bno, (uintmax_t)ip->i_number); ext2_fserr(fs, ip->i_uid, "bad block"); return; } error = bread(ip->i_devvp, fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return; } bbp = (char *)bp->b_data; bno = dtogd(fs, bno); if (isclr(bbp, bno)) { printf("block = %lld, fs = %s\n", (long long)bno, fs->e2fs_fsmnt); panic("ext2_blkfree: freeing free block"); } clrbit(bbp, bno); EXT2_LOCK(ump); ext2_clusteracct(fs, bbp, cg, bno, 1); fs->e2fs->e2fs_fbcount++; fs->e2fs_gd[cg].ext2bgd_nbfree++; fs->e2fs_fmod = 1; EXT2_UNLOCK(ump); bdwrite(bp); } /* * Free an inode. * */ int ext2_vfree(struct vnode *pvp, ino_t ino, int mode) { struct m_ext2fs *fs; struct inode *pip; struct buf *bp; struct ext2mount *ump; int error, cg; char *ibp; pip = VTOI(pvp); fs = pip->i_e2fs; ump = pip->i_ump; if ((u_int)ino > fs->e2fs_ipg * fs->e2fs_gcount) panic("ext2_vfree: range: devvp = %p, ino = %ju, fs = %s", pip->i_devvp, (uintmax_t)ino, fs->e2fs_fsmnt); cg = ino_to_cg(fs, ino); error = bread(pip->i_devvp, fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_i_bitmap), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (0); } ibp = (char *)bp->b_data; ino = (ino - 1) % fs->e2fs->e2fs_ipg; if (isclr(ibp, ino)) { printf("ino = %llu, fs = %s\n", (unsigned long long)ino, fs->e2fs_fsmnt); if (fs->e2fs_ronly == 0) panic("ext2_vfree: freeing free inode"); } clrbit(ibp, ino); EXT2_LOCK(ump); fs->e2fs->e2fs_ficount++; fs->e2fs_gd[cg].ext2bgd_nifree++; if ((mode & IFMT) == IFDIR) { fs->e2fs_gd[cg].ext2bgd_ndirs--; fs->e2fs_total_dir--; } fs->e2fs_fmod = 1; EXT2_UNLOCK(ump); bdwrite(bp); return (0); } /* * Find a block in the specified cylinder group. 
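 * ext2_alloccg falls back to this linear bitmap scan when neither
 * the preferred block nor a contiguous run of eight blocks is free.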
* * It is a panic if a request is made to find a block if none are * available. */ static daddr_t ext2_mapsearch(struct m_ext2fs *fs, char *bbp, daddr_t bpref) { char *loc; int start, len; /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = 0; len = howmany(fs->e2fs->e2fs_fpg, NBBY) - start; loc = memcchr(&bbp[start], 0xff, len); if (loc == NULL) { len = start + 1; start = 0; loc = memcchr(&bbp[start], 0xff, len); if (loc == NULL) { printf("start = %d, len = %d, fs = %s\n", start, len, fs->e2fs_fsmnt); panic("ext2_mapsearch: map corrupted"); /* NOTREACHED */ } } return ((loc - bbp) * NBBY + ffs(~*loc) - 1); } /* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: * fs: error message */ static void ext2_fserr(struct m_ext2fs *fs, uid_t uid, char *cp) { log(LOG_ERR, "uid %u on %s: %s\n", uid, fs->e2fs_fsmnt, cp); } int cg_has_sb(int i) { int a3, a5, a7; if (i == 0 || i == 1) return 1; for (a3 = 3, a5 = 5, a7 = 7; a3 <= i || a5 <= i || a7 <= i; a3 *= 3, a5 *= 5, a7 *= 7) if (i == a3 || i == a5 || i == a7) return 1; return 0; } Index: head/sys/fs/ext2fs/ext2_balloc.c =================================================================== --- head/sys/fs/ext2fs/ext2_balloc.c (revision 313779) +++ head/sys/fs/ext2fs/ext2_balloc.c (revision 313780) @@ -1,300 +1,300 @@ /*- * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Balloc defines the structure of filesystem storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. 
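 * The first EXT2_NDADDR logical blocks map directly from the inode;
 * anything beyond that is reached through the chain of indirect
 * blocks computed by ext2_getlbns(), with each missing level of the
 * chain allocated on the way down.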
*/ int ext2_balloc(struct inode *ip, e2fs_lbn_t lbn, int size, struct ucred *cred, struct buf **bpp, int flags) { struct m_ext2fs *fs; struct ext2mount *ump; struct buf *bp, *nbp; struct vnode *vp = ITOV(ip); - struct indir indirs[NIADDR + 2]; + struct indir indirs[EXT2_NIADDR + 2]; e4fs_daddr_t nb, newb; e2fs_daddr_t *bap, pref; int osize, nsize, num, i, error; *bpp = NULL; if (lbn < 0) return (EFBIG); fs = ip->i_e2fs; ump = ip->i_ump; /* * check if this is a sequential block allocation. * If so, increment next_alloc fields to allow ext2_blkpref * to make a good guess */ if (lbn == ip->i_next_alloc_block + 1) { ip->i_next_alloc_block++; ip->i_next_alloc_goal++; } /* - * The first NDADDR blocks are direct blocks + * The first EXT2_NDADDR blocks are direct blocks */ - if (lbn < NDADDR) { + if (lbn < EXT2_NDADDR) { nb = ip->i_db[lbn]; /* * no new block is to be allocated, and no need to expand * the file */ if (nb != 0 && ip->i_size >= (lbn + 1) * fs->e2fs_bsize) { error = bread(vp, lbn, fs->e2fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread(vp, lbn, osize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); } else { /* * Godmar thinks: this shouldn't happen w/o * fragments */ printf("nsize %d(%d) > osize %d(%d) nb %d\n", (int)nsize, (int)size, (int)osize, (int)ip->i_size, (int)nb); panic( "ext2_balloc: Something is terribly wrong"); /* * please note there haven't been any changes from here on - * FFS seems to work. */ } } else { if (ip->i_size < (lbn + 1) * fs->e2fs_bsize) nsize = fragroundup(fs, size); else nsize = fs->e2fs_bsize; EXT2_LOCK(ump); error = ext2_alloc(ip, lbn, ext2_blkpref(ip, lbn, (int)lbn, &ip->i_db[0], 0), nsize, cred, &newb); if (error) return (error); bp = getblk(vp, lbn, nsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); } ip->i_db[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); } /* * Determine the number of levels of indirection. */ pref = 0; if ((error = ext2_getlbns(vp, lbn, indirs, &num)) != 0) return (error); #ifdef INVARIANTS if (num < 1) panic("ext2_balloc: ext2_getlbns returned indirect block"); #endif /* * Fetch the first indirect block allocating if necessary. */ --num; nb = ip->i_ib[indirs[0].in_off]; if (nb == 0) { EXT2_LOCK(ump); pref = ext2_blkpref(ip, lbn, indirs[0].in_off + EXT2_NDIR_BLOCKS, &ip->i_db[0], 0); if ((error = ext2_alloc(ip, lbn, pref, fs->e2fs_bsize, cred, &newb))) return (error); nb = newb; bp = getblk(vp, indirs[1].in_lbn, fs->e2fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, newb); vfs_bio_clrbuf(bp); /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(bp)) != 0) { ext2_blkfree(ip, nb, fs->e2fs_bsize); return (error); } ip->i_ib[indirs[0].in_off] = newb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. 
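 * At each level, indirs[i].in_lbn names the indirect block and
 * indirs[i].in_off indexes the pointer array inside it; a newly
 * allocated level is written synchronously before its parent is
 * updated, so an on-disk indirect block never points at garbage.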
*/ for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bap = (e2fs_daddr_t *)bp->b_data; nb = bap[indirs[i].in_off]; if (i == num) break; i += 1; if (nb != 0) { bqrelse(bp); continue; } EXT2_LOCK(ump); if (pref == 0) pref = ext2_blkpref(ip, lbn, indirs[i].in_off, bap, bp->b_lblkno); error = ext2_alloc(ip, lbn, pref, (int)fs->e2fs_bsize, cred, &newb); if (error) { brelse(bp); return (error); } nb = newb; nbp = getblk(vp, indirs[i].in_lbn, fs->e2fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(nbp)) != 0) { ext2_blkfree(ip, nb, fs->e2fs_bsize); EXT2_UNLOCK(ump); brelse(bp); return (error); } bap[indirs[i - 1].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } /* * Get the data block, allocating if necessary. */ if (nb == 0) { EXT2_LOCK(ump); pref = ext2_blkpref(ip, lbn, indirs[i].in_off, &bap[0], bp->b_lblkno); if ((error = ext2_alloc(ip, lbn, pref, (int)fs->e2fs_bsize, cred, &newb)) != 0) { brelse(bp); return (error); } nb = newb; nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } *bpp = nbp; return (0); } brelse(bp); if (flags & BA_CLRBUF) { int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->e2fs_bsize, NOCRED, MAXBSIZE, seqcount, 0, &nbp); } else { error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp); } if (error) { brelse(nbp); return (error); } } else { nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); } *bpp = nbp; return (0); } Index: head/sys/fs/ext2fs/ext2_bmap.c =================================================================== --- head/sys/fs/ext2fs/ext2_bmap.c (revision 313779) +++ head/sys/fs/ext2fs/ext2_bmap.c (revision 313780) @@ -1,385 +1,386 @@ /*- * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ext4_bmapext(struct vnode *, int32_t, int64_t *, int *, int *); /* * Bmap converts the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ext2_bmap(struct vop_bmap_args *ap) { daddr_t blkno; int error; /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_bop != NULL) *ap->a_bop = &VTOI(ap->a_vp)->i_devvp->v_bufobj; if (ap->a_bnp == NULL) return (0); if (VTOI(ap->a_vp)->i_flag & IN_E4EXTENTS) error = ext4_bmapext(ap->a_vp, ap->a_bn, &blkno, ap->a_runp, ap->a_runb); else error = ext2_bmaparray(ap->a_vp, ap->a_bn, &blkno, ap->a_runp, ap->a_runb); *ap->a_bnp = blkno; return (error); } /* * Convert the logical block number of a file to its physical block number * on the disk within ext4 extents. */ static int ext4_bmapext(struct vnode *vp, int32_t bn, int64_t *bnp, int *runp, int *runb) { struct inode *ip; struct m_ext2fs *fs; struct ext4_extent *ep; struct ext4_extent_path path = {.ep_bp = NULL}; daddr_t lbn; int error; ip = VTOI(vp); fs = ip->i_e2fs; lbn = bn; if (runp != NULL) *runp = 0; if (runb != NULL) *runb = 0; error = 0; ext4_ext_find_extent(fs, ip, lbn, &path); if (path.ep_is_sparse) { *bnp = -1; if (runp != NULL) *runp = path.ep_sparse_ext.e_len - (lbn - path.ep_sparse_ext.e_blk) - 1; if (runb != NULL) *runb = lbn - path.ep_sparse_ext.e_blk; } else { if (path.ep_ext == NULL) { error = EIO; goto out; } ep = path.ep_ext; *bnp = fsbtodb(fs, lbn - ep->e_blk + (ep->e_start_lo | (daddr_t)ep->e_start_hi << 32)); if (*bnp == 0) *bnp = -1; if (runp != NULL) *runp = ep->e_len - (lbn - ep->e_blk) - 1; if (runb != NULL) *runb = lbn - ep->e_blk; } out: if (path.ep_bp != NULL) brelse(path.ep_bp); return (error); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ext2_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. 
* Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). */ int ext2_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, int *runp, int *runb) { struct inode *ip; struct buf *bp; struct ext2mount *ump; struct mount *mp; - struct indir a[NIADDR + 1], *ap; + struct indir a[EXT2_NIADDR + 1], *ap; daddr_t daddr; e2fs_lbn_t metalbn; int error, num, maxrun = 0, bsize; int *nump; ap = NULL; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOEXT2(mp); bsize = EXT2_BLOCK_SIZE(ump->um_e2fs); if (runp) { maxrun = mp->mnt_iosize_max / bsize - 1; *runp = 0; } if (runb) *runb = 0; ap = a; nump = # error = ext2_getlbns(vp, bn, ap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) { *bnp = -1; } else if (runp) { daddr_t bnb = bn; - for (++bn; bn < NDADDR && *runp < maxrun && + for (++bn; bn < EXT2_NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn + 1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[ap->in_off]; for (bp = NULL, ++ap; --num; ++ap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = ap->in_lbn; if ((daddr == 0 && !incore(&vp->v_bufobj, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. */ if (bp) bqrelse(bp); bp = getblk(vp, metalbn, bsize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef INVARIANTS if (!daddr) panic("ext2_bmaparray: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { brelse(bp); return (error); } } daddr = ((e2fs_daddr_t *)bp->b_data)[ap->in_off]; if (num == 1 && daddr && runp) { for (bn = ap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((e2fs_daddr_t *)bp->b_data)[bn - 1], ((e2fs_daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = ap->in_off; if (runb && bn) { for (--bn; bn >= 0 && *runb < maxrun && is_sequential(ump, ((e2fs_daddr_t *)bp->b_data)[bn], ((e2fs_daddr_t *)bp->b_data)[bn + 1]); --bn, ++*runb); } } } if (bp) bqrelse(bp); /* * Since this is FFS independent code, we are out of scope for the * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they * will fall in the range 1..um_seqinc, so we use that test and * return a request for a zeroed out buffer if attempts are made * to read a BLK_NOCOPY or BLK_SNAP block. */ if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc) { *bnp = -1; return (0); } *bnp = blkptrtodb(ump, daddr); if (*bnp == 0) { *bnp = -1; } return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. 
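 * For example, with 1024 block pointers per indirect block (a 4K
 * block size), data block 12, the first singly-indirect block,
 * yields the two-entry path {-12, 0}, {-12, 0}: once for the slot
 * in i_ib[] and once for the offset within the indirect block.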
* Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. */ int ext2_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump) { long blockcnt; e2fs_lbn_t metalbn, realbn; struct ext2mount *ump; int i, numlevels, off; int64_t qblockcnt; ump = VFSTOEXT2(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; - /* The first NDADDR blocks are direct blocks. */ - if (bn < NDADDR) + /* The first EXT2_NDADDR blocks are direct blocks. */ + if (bn < EXT2_NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible - * at the previous level of indirection, and NIADDR - i is the number - * of levels of indirection needed to locate the requested block. + * at the previous level of indirection, and EXT2_NIADDR - i is the + * number of levels of indirection needed to locate the requested block. */ - for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + for (blockcnt = 1, i = EXT2_NIADDR, bn -= EXT2_NDADDR; ; + i--, bn -= blockcnt) { if (i == 0) return (EFBIG); /* * Use int64_t's here to avoid overflow for triple indirect * blocks when longs have 32 bits and the block size is more * than 4K. */ qblockcnt = (int64_t)blockcnt * MNINDIR(ump); if (bn < qblockcnt) break; blockcnt = qblockcnt; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) - metalbn = -(realbn - bn + NIADDR - i); + metalbn = -(realbn - bn + EXT2_NIADDR - i); else - metalbn = -(-realbn - bn + NIADDR - i); + metalbn = -(-realbn - bn + EXT2_NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; - ap->in_off = off = NIADDR - i; + ap->in_off = off = EXT2_NIADDR - i; ap++; - for (++numlevels; i <= NIADDR; i++) { + for (++numlevels; i <= EXT2_NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ++ap; metalbn -= -1 + off * blockcnt; blockcnt /= MNINDIR(ump); } if (nump) *nump = numlevels; return (0); } Index: head/sys/fs/ext2fs/ext2_inode.c =================================================================== --- head/sys/fs/ext2fs/ext2_inode.c (revision 313779) +++ head/sys/fs/ext2fs/ext2_inode.c (revision 313780) @@ -1,528 +1,529 @@ /*- * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_inode.c 8.5 (Berkeley) 12/30/93 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ext2_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int, e4fs_daddr_t *); /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode * to disk if the IN_MODIFIED flag is set (it may be set initially, or by * the timestamp update). The IN_LAZYMOD flag is set to force a write * later if not now. If we write now, then clear both IN_MODIFIED and * IN_LAZYMOD to reflect the presumably successful write, and if waitfor is * set, then wait for the write to complete. */ int ext2_update(struct vnode *vp, int waitfor) { struct m_ext2fs *fs; struct buf *bp; struct inode *ip; int error; ASSERT_VOP_ELOCKED(vp, "ext2_update"); ext2_itimes(vp); ip = VTOI(vp); if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); fs = ip->i_e2fs; if (fs->e2fs_ronly) return (0); if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); return (error); } ext2_i2ei(ip, (struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number))); if (waitfor && !DOINGASYNC(vp)) return (bwrite(bp)); else { bdwrite(bp); return (0); } } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode oip to at most length size, freeing the * disk blocks. 
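 * Shrinking runs in three passes: whole indirect trees are released
 * first (via ext2_indirtrunc), then whole direct blocks, and last
 * any change in size of the final remaining direct block.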
*/ int ext2_truncate(struct vnode *vp, off_t length, int flags, struct ucred *cred, struct thread *td) { struct vnode *ovp = vp; int32_t lastblock; struct inode *oip; - int32_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; - uint32_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; + int32_t bn, lbn, lastiblock[EXT2_NIADDR], indir_lbn[EXT2_NIADDR]; + uint32_t oldblks[EXT2_NDADDR + EXT2_NIADDR]; + uint32_t newblks[EXT2_NDADDR + EXT2_NIADDR]; struct m_ext2fs *fs; struct buf *bp; int offset, size, level; e4fs_daddr_t count, nblocks, blocksreleased = 0; int error, i, allerror; off_t osize; #ifdef INVARIANTS struct bufobj *bo; #endif oip = VTOI(ovp); #ifdef INVARIANTS bo = &ovp->v_bufobj; #endif ASSERT_VOP_LOCKED(vp, "ext2_truncate"); if (length < 0) return (EINVAL); if (ovp->v_type == VLNK && oip->i_size < ovp->v_mount->mnt_maxsymlinklen) { #ifdef INVARIANTS if (length != 0) panic("ext2_truncate: partial truncate of symlink"); #endif bzero((char *)&oip->i_shortlink, (u_int)oip->i_size); oip->i_size = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(ovp, 1)); } if (oip->i_size == length) { oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(ovp, 0)); } fs = oip->i_e2fs; osize = oip->i_size;
/* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { if (length > oip->i_e2fs->e2fs_maxfilesize) return (EFBIG); vnode_pager_setsize(ovp, length); offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset + 1, cred, &bp, flags); if (error) { vnode_pager_setsize(vp, osize); return (error); } oip->i_size = length; if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2_update(ovp, !DOINGASYNC(ovp))); }
/* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be * zero'ed in case it ever becomes accessible again because * of subsequent file growth. */ /* I don't understand the comment above */ offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = ext2_balloc(oip, lbn, offset, cred, &bp, flags); if (error) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (bp->b_bufsize == fs->e2fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(ovp)) bdwrite(bp); else bawrite(bp); }
/* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->e2fs_bsize - 1) - 1; - lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[SINGLE] = lastblock - EXT2_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->e2fs_bsize);
/* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ext2_indirtrunc below.
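 * (When that inode update is written synchronously, a crash before
 * the frees below can only leak blocks for fsck to reclaim; it can
 * never leave the inode pointing at blocks that have been reused.)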
*/ for (level = TRIPLE; level >= SINGLE; level--) { - oldblks[NDADDR + level] = oip->i_ib[level]; + oldblks[EXT2_NDADDR + level] = oip->i_ib[level]; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } } - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < EXT2_NDADDR; i++) { oldblks[i] = oip->i_db[i]; if (i > lastblock) oip->i_db[i] = 0; } oip->i_flag |= IN_CHANGE | IN_UPDATE; allerror = ext2_update(ovp, !DOINGASYNC(ovp)); /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < EXT2_NDADDR; i++) { newblks[i] = oip->i_db[i]; oip->i_db[i] = oldblks[i]; } - for (i = 0; i < NIADDR; i++) { - newblks[NDADDR + i] = oip->i_ib[i]; - oip->i_ib[i] = oldblks[NDADDR + i]; + for (i = 0; i < EXT2_NIADDR; i++) { + newblks[EXT2_NDADDR + i] = oip->i_ib[i]; + oip->i_ib[i] = oldblks[EXT2_NDADDR + i]; } oip->i_size = osize; error = vtruncbuf(ovp, cred, length, (int)fs->e2fs_bsize); if (error && (allerror == 0)) allerror = error; vnode_pager_setsize(ovp, length); /* * Indirect blocks first. */ - indir_lbn[SINGLE] = -NDADDR; + indir_lbn[SINGLE] = -EXT2_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = oip->i_ib[level]; if (bn != 0) { error = ext2_indirtrunc(oip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; ext2_blkfree(oip, bn, fs->e2fs_fsize); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ - for (i = NDADDR - 1; i > lastblock; i--) { + for (i = EXT2_NDADDR - 1; i > lastblock; i--) { long bsize; bn = oip->i_db[i]; if (bn == 0) continue; oip->i_db[i] = 0; bsize = blksize(fs, oip, i); ext2_blkfree(oip, bn, bsize); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = oip->i_db[lastblock]; if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, oip, lastblock); oip->i_size = length; newspace = blksize(fs, oip, lastblock); if (newspace == 0) panic("ext2_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += numfrags(fs, newspace); ext2_blkfree(oip, bn, oldspace - newspace); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef INVARIANTS for (level = SINGLE; level <= TRIPLE; level++) - if (newblks[NDADDR + level] != oip->i_ib[level]) + if (newblks[EXT2_NDADDR + level] != oip->i_ib[level]) panic("itrunc1"); - for (i = 0; i < NDADDR; i++) + for (i = 0; i < EXT2_NDADDR; i++) if (newblks[i] != oip->i_db[i]) panic("itrunc2"); BO_LOCK(bo); if (length == 0 && (bo->bo_dirty.bv_cnt != 0 || bo->bo_clean.bv_cnt != 0)) panic("itrunc3"); BO_UNLOCK(bo); #endif /* INVARIANTS */ /* * Put back the real size. 
*/ oip->i_size = length; if (oip->i_blocks >= blocksreleased) oip->i_blocks -= blocksreleased; else /* sanity */ oip->i_blocks = 0; oip->i_flag |= IN_CHANGE; vnode_pager_setsize(ovp, length); return (allerror); }
/* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ext2_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, int level, e4fs_daddr_t *countp) { struct buf *bp; struct m_ext2fs *fs = ip->i_e2fs; struct vnode *vp; e2fs_daddr_t *bap, *copy; int i, nblocks, error = 0, allerror = 0; e2fs_lbn_t nb, nlbn, last; e4fs_daddr_t blkcount, factor, blocksreleased = 0;
/* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->e2fs_bsize);
/* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update the on disk copy first. Since * double (triple) indirect blocks are freed before single (double) * indirect blocks, calls to bmap on these blocks will fail. However, * we already have the on disk address, so we have to set the b_blkno * field explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0, 0); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { bp->b_iocmd = BIO_READ; if (bp->b_bcount > bp->b_bufsize) panic("ext2_indirtrunc: bad buffer size"); bp->b_blkno = dbn; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); error = bufwait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } bap = (e2fs_daddr_t *)bp->b_data; copy = malloc(fs->e2fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->e2fs_bsize); bzero((caddr_t)&bap[last + 1], (NINDIR(fs) - (last + 1)) * sizeof(e2fs_daddr_t)); if (last == -1) bp->b_flags |= B_INVAL; if (DOINGASYNC(vp)) { bdwrite(bp); } else { error = bwrite(bp); if (error) allerror = error; } bap = copy;
/* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = bap[i]; if (nb == 0) continue; if (level > SINGLE) { if ((error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (int32_t)-1, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } ext2_blkfree(ip, nb, fs->e2fs_bsize); blocksreleased += nblocks; }
/* * Recursively free last partial block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = bap[i]; if (nb != 0) { if ((error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } } free(copy, M_TEMP); *countp = blocksreleased; return (allerror); }
/* * discard preallocated blocks */ int ext2_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct thread *td = ap->a_td; int mode, error = 0;
/* * Ignore inodes related to stale file handles.
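 * ext2_ei2i() leaves i_mode zero when the on-disk link count is
 * zero, so a zero mode here means there is nothing left to
 * truncate or write back.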
*/ if (ip->i_mode == 0) goto out; if (ip->i_nlink <= 0) { error = ext2_truncate(vp, (off_t)0, 0, NOCRED, td); ip->i_rdev = 0; mode = ip->i_mode; ip->i_mode = 0; ip->i_flag |= IN_CHANGE | IN_UPDATE; ext2_vfree(vp, ip->i_number, mode); } if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) ext2_update(vp, 0); out: /* * If we are done with the inode, reclaim it * so that it can be reused immediately. */ if (ip->i_mode == 0) vrecycle(vp); return (error); } /* * Reclaim an inode so that it can be used for other purposes. */ int ext2_reclaim(struct vop_reclaim_args *ap) { struct inode *ip; struct vnode *vp = ap->a_vp; ip = VTOI(vp); if (ip->i_flag & IN_LAZYMOD) { ip->i_flag |= IN_MODIFIED; ext2_update(vp, 0); } vfs_hash_remove(vp); free(vp->v_data, M_EXT2NODE); vp->v_data = 0; vnode_destroy_vobject(vp); return (0); } Index: head/sys/fs/ext2fs/ext2_inode_cnv.c =================================================================== --- head/sys/fs/ext2fs/ext2_inode_cnv.c (revision 313779) +++ head/sys/fs/ext2fs/ext2_inode_cnv.c (revision 313780) @@ -1,176 +1,176 @@ /*- * Copyright (c) 1995 The University of Utah and * the Computer Systems Laboratory at the University of Utah (CSL). * All rights reserved. * * Permission to use, copy, modify and distribute this software is hereby * granted provided that (1) source code retains these copyright, permission, * and disclaimer notices, and (2) redistributions including binaries * reproduce the notices in supporting documentation, and (3) all advertising * materials mentioning features or use of this software display the following * acknowledgement: ``This product includes software developed by the * Computer Systems Laboratory at the University of Utah.'' * * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * CSL requests users of this software to return to csl-dist@cs.utah.edu any * improvements that they make and grant CSL redistribution rights. * * Utah $Hdr$ * $FreeBSD$ */ /* * routines to convert on disk ext2 inodes into inodes and back */ #include #include #include #include #include #include #include #include #include #include #include #define XTIME_TO_NSEC(x) ((x & EXT3_NSEC_MASK) >> 2) #define NSEC_TO_XTIME(t) (le32toh(t << 2) & EXT3_NSEC_MASK) #ifdef EXT2FS_DEBUG void ext2_print_inode(struct inode *in) { int i; struct ext4_extent_header *ehp; struct ext4_extent *ep; printf("Inode: %5ju", (uintmax_t)in->i_number); printf( /* "Inode: %5d" */ " Type: %10s Mode: 0x%o Flags: 0x%x Version: %d\n", "n/a", in->i_mode, in->i_flags, in->i_gen); printf("User: %5u Group: %5u Size: %ju\n", in->i_uid, in->i_gid, (uintmax_t)in->i_size); printf("Links: %3d Blockcount: %ju\n", in->i_nlink, (uintmax_t)in->i_blocks); printf("ctime: 0x%x", in->i_ctime); printf("atime: 0x%x", in->i_atime); printf("mtime: 0x%x", in->i_mtime); if (E2DI_HAS_XTIME(in)) printf("crtime %#x ", in->i_birthtime); printf("BLOCKS:"); for (i = 0; i < (in->i_blocks <= 24 ? 
(in->i_blocks + 1) / 2 : 12); i++) printf(" %d", in->i_db[i]); printf("\n"); printf("Extents:\n"); ehp = (struct ext4_extent_header *)in->i_db; printf("Header (magic 0x%x entries %d max %d depth %d gen %d)\n", ehp->eh_magic, ehp->eh_ecount, ehp->eh_max, ehp->eh_depth, ehp->eh_gen); ep = (struct ext4_extent *)(char *)(ehp + 1); printf("Index (blk %d len %d start_lo %d start_hi %d)\n", ep->e_blk, ep->e_len, ep->e_start_lo, ep->e_start_hi); printf("\n"); } #endif /* EXT2FS_DEBUG */ /* * raw ext2 inode to inode */ void ext2_ei2i(struct ext2fs_dinode *ei, struct inode *ip) { int i; ip->i_nlink = ei->e2di_nlink; /* * Godmar thinks - if the link count is zero, then the inode is * unused - according to ext2 standards. Ufs marks this fact by * setting i_mode to zero - why ? I can see that this might lead to * problems in an undelete. */ ip->i_mode = ei->e2di_nlink ? ei->e2di_mode : 0; ip->i_size = ei->e2di_size; if (S_ISREG(ip->i_mode)) ip->i_size |= ((u_int64_t)ei->e2di_size_high) << 32; ip->i_atime = ei->e2di_atime; ip->i_mtime = ei->e2di_mtime; ip->i_ctime = ei->e2di_ctime; if (E2DI_HAS_XTIME(ip)) { ip->i_atimensec = XTIME_TO_NSEC(ei->e2di_atime_extra); ip->i_mtimensec = XTIME_TO_NSEC(ei->e2di_mtime_extra); ip->i_ctimensec = XTIME_TO_NSEC(ei->e2di_ctime_extra); ip->i_birthtime = ei->e2di_crtime; ip->i_birthnsec = XTIME_TO_NSEC(ei->e2di_crtime_extra); } ip->i_flags = 0; ip->i_flags |= (ei->e2di_flags & EXT2_APPEND) ? SF_APPEND : 0; ip->i_flags |= (ei->e2di_flags & EXT2_IMMUTABLE) ? SF_IMMUTABLE : 0; ip->i_flags |= (ei->e2di_flags & EXT2_NODUMP) ? UF_NODUMP : 0; ip->i_flag |= (ei->e2di_flags & EXT3_INDEX) ? IN_E3INDEX : 0; ip->i_flag |= (ei->e2di_flags & EXT4_EXTENTS) ? IN_E4EXTENTS : 0; ip->i_blocks = ei->e2di_nblock; if (E2DI_HAS_HUGE_FILE(ip)) { ip->i_blocks |= (uint64_t)ei->e2di_nblock_high << 32; if (ei->e2di_flags & EXT4_HUGE_FILE) ip->i_blocks = fsbtodb(ip->i_e2fs, ip->i_blocks); } ip->i_gen = ei->e2di_gen; ip->i_uid = ei->e2di_uid; ip->i_gid = ei->e2di_gid; /* XXX use memcpy */ - for (i = 0; i < NDADDR; i++) + for (i = 0; i < EXT2_NDADDR; i++) ip->i_db[i] = ei->e2di_blocks[i]; - for (i = 0; i < NIADDR; i++) + for (i = 0; i < EXT2_NIADDR; i++) ip->i_ib[i] = ei->e2di_blocks[EXT2_NDIR_BLOCKS + i]; } /* * inode to raw ext2 inode */ void ext2_i2ei(struct inode *ip, struct ext2fs_dinode *ei) { int i; ei->e2di_mode = ip->i_mode; ei->e2di_nlink = ip->i_nlink; /* * Godmar thinks: if dtime is nonzero, ext2 says this inode has been * deleted, this would correspond to a zero link count */ ei->e2di_dtime = ei->e2di_nlink ? 0 : ip->i_mtime; ei->e2di_size = ip->i_size; if (S_ISREG(ip->i_mode)) ei->e2di_size_high = ip->i_size >> 32; ei->e2di_atime = ip->i_atime; ei->e2di_mtime = ip->i_mtime; ei->e2di_ctime = ip->i_ctime; if (E2DI_HAS_XTIME(ip)) { ei->e2di_ctime_extra = NSEC_TO_XTIME(ip->i_ctimensec); ei->e2di_mtime_extra = NSEC_TO_XTIME(ip->i_mtimensec); ei->e2di_atime_extra = NSEC_TO_XTIME(ip->i_atimensec); ei->e2di_crtime = ip->i_birthtime; ei->e2di_crtime_extra = NSEC_TO_XTIME(ip->i_birthnsec); } ei->e2di_flags = 0; ei->e2di_flags |= (ip->i_flags & SF_APPEND) ? EXT2_APPEND : 0; ei->e2di_flags |= (ip->i_flags & SF_IMMUTABLE) ? EXT2_IMMUTABLE : 0; ei->e2di_flags |= (ip->i_flags & UF_NODUMP) ? EXT2_NODUMP : 0; ei->e2di_flags |= (ip->i_flag & IN_E3INDEX) ? EXT3_INDEX : 0; ei->e2di_flags |= (ip->i_flag & IN_E4EXTENTS) ? 
EXT4_EXTENTS : 0; ei->e2di_nblock = ip->i_blocks & 0xffffffff; ei->e2di_nblock_high = ip->i_blocks >> 32 & 0xffff; ei->e2di_gen = ip->i_gen; ei->e2di_uid = ip->i_uid; ei->e2di_gid = ip->i_gid; /* XXX use memcpy */ - for (i = 0; i < NDADDR; i++) + for (i = 0; i < EXT2_NDADDR; i++) ei->e2di_blocks[i] = ip->i_db[i]; - for (i = 0; i < NIADDR; i++) + for (i = 0; i < EXT2_NIADDR; i++) ei->e2di_blocks[EXT2_NDIR_BLOCKS + i] = ip->i_ib[i]; } Index: head/sys/fs/ext2fs/inode.h =================================================================== --- head/sys/fs/ext2fs/inode.h (revision 313779) +++ head/sys/fs/ext2fs/inode.h (revision 313780) @@ -1,188 +1,188 @@ /*- * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)inode.h 8.9 (Berkeley) 5/14/95 * $FreeBSD$ */ #ifndef _FS_EXT2FS_INODE_H_ #define _FS_EXT2FS_INODE_H_ #include #include #include #include #include /* * This must agree with the definition in . */ #define doff_t int32_t -#define NDADDR 12 /* Direct addresses in inode. */ -#define NIADDR 3 /* Indirect addresses in inode. */ +#define EXT2_NDADDR 12 /* Direct addresses in inode. */ +#define EXT2_NIADDR 3 /* Indirect addresses in inode. */ /* * The size of physical and logical block numbers in EXT2FS. */ typedef uint32_t e2fs_daddr_t; typedef int64_t e2fs_lbn_t; typedef int64_t e4fs_daddr_t; /* * The inode is used to describe each active (or recently active) file in the * EXT2FS filesystem. It is composed of two types of information. The first * part is the information that is needed only while the file is active (such * as the identity of the file and linkage to speed its lookup). 
The second * part is the permanent meta-data associated with the file which is read in * from the permanent dinode from long term storage when the file becomes * active, and is put back when the file is no longer being used. */ struct inode { struct vnode *i_vnode;/* Vnode associated with this inode. */ struct ext2mount *i_ump; uint32_t i_flag; /* flags, see below */ ino_t i_number; /* The identity of the inode. */ struct m_ext2fs *i_e2fs; /* EXT2FS */ u_quad_t i_modrev; /* Revision level for NFS lease. */ /* * Side effects; used during directory lookup. */ int32_t i_count; /* Size of free slot in directory. */ doff_t i_endoff; /* End of useful stuff in directory. */ doff_t i_diroff; /* Offset in dir, where we found last entry. */ doff_t i_offset; /* Offset of free space in directory. */ uint32_t i_block_group; uint32_t i_next_alloc_block; uint32_t i_next_alloc_goal; /* Fields from struct dinode in UFS. */ uint16_t i_mode; /* IFMT, permissions; see below. */ int16_t i_nlink; /* File link count. */ uint32_t i_uid; /* File owner. */ uint32_t i_gid; /* File group. */ uint64_t i_size; /* File byte count. */ uint64_t i_blocks; /* Blocks actually held. */ int32_t i_atime; /* Last access time. */ int32_t i_mtime; /* Last modified time. */ int32_t i_ctime; /* Last inode change time. */ int32_t i_birthtime; /* Inode creation time. */ int32_t i_mtimensec; /* Last modified time. */ int32_t i_atimensec; /* Last access time. */ int32_t i_ctimensec; /* Last inode change time. */ int32_t i_birthnsec; /* Inode creation time. */ uint32_t i_gen; /* Generation number. */ uint32_t i_flags; /* Status flags (chflags). */ - uint32_t i_db[NDADDR]; /* Direct disk blocks. */ - uint32_t i_ib[NIADDR]; /* Indirect disk blocks. */ + uint32_t i_db[EXT2_NDADDR]; /* Direct disk blocks. */ + uint32_t i_ib[EXT2_NIADDR]; /* Indirect disk blocks. */ struct ext4_extent_cache i_ext_cache; /* cache for ext4 extent */ }; /* * The di_db fields may be overlaid with other information for * file types that do not have associated disk storage. Block * and character devices overlay the first data block with their * dev_t value. Short symbolic links place their path in the * di_db area. */ #define i_shortlink i_db #define i_rdev i_db[0] /* File permissions. */ #define IEXEC 0000100 /* Executable. */ #define IWRITE 0000200 /* Writeable. */ #define IREAD 0000400 /* Readable. */ #define ISVTX 0001000 /* Sticky bit. */ #define ISGID 0002000 /* Set-gid. */ #define ISUID 0004000 /* Set-uid. */ /* File types. */ #define IFMT 0170000 /* Mask of file type. */ #define IFIFO 0010000 /* Named pipe (fifo). */ #define IFCHR 0020000 /* Character device. */ #define IFDIR 0040000 /* Directory file. */ #define IFBLK 0060000 /* Block device. */ #define IFREG 0100000 /* Regular file. */ #define IFLNK 0120000 /* Symbolic link. */ #define IFSOCK 0140000 /* UNIX domain socket. */ #define IFWHT 0160000 /* Whiteout. */ /* These flags are kept in i_flag. */ #define IN_ACCESS 0x0001 /* Access time update request. */ #define IN_CHANGE 0x0002 /* Inode change time update request. */ #define IN_UPDATE 0x0004 /* Modification time update request. */ #define IN_MODIFIED 0x0008 /* Inode has been modified. */ #define IN_RENAME 0x0010 /* Inode is being renamed. */ #define IN_HASHED 0x0020 /* Inode is on hash list */ #define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */ #define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. 
*/ #define IN_LAZYACCESS 0x0100 /* Process IN_ACCESS after the suspension finished */ /* * These are translation flags for some attributes that Ext4 * passes as inode flags but that we cannot pass directly. */ #define IN_E3INDEX 0x010000 #define IN_E4EXTENTS 0x020000 #define i_devvp i_ump->um_devvp #ifdef _KERNEL /* * Structure used to pass around logical block paths generated by * ext2_getlbns and used by truncate and bmap code. */ struct indir { e2fs_lbn_t in_lbn; /* Logical block number. */ int in_off; /* Offset in buffer. */ }; /* Convert between inode pointers and vnode pointers. */ #define VTOI(vp) ((struct inode *)(vp)->v_data) #define ITOV(ip) ((ip)->i_vnode) /* This overlays the fid structure (see mount.h). */ struct ufid { uint16_t ufid_len; /* Length of structure. */ uint16_t ufid_pad; /* Force 32-bit alignment. */ ino_t ufid_ino; /* File number (ino). */ uint32_t ufid_gen; /* Generation number. */ }; #endif /* _KERNEL */ #endif /* !_FS_EXT2FS_INODE_H_ */ Index: head/sys/fs/nandfs/bmap.c =================================================================== --- head/sys/fs/nandfs/bmap.c (revision 313779) +++ head/sys/fs/nandfs/bmap.c (revision 313780) @@ -1,623 +1,623 @@ /*- * Copyright (c) 2012 Semihalf * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nandfs_mount.h" #include "nandfs.h" #include "nandfs_subr.h" #include "bmap.h" static int bmap_getlbns(struct nandfs_node *, nandfs_lbn_t, struct nandfs_indir *, int *); int bmap_lookup(struct nandfs_node *node, nandfs_lbn_t lblk, nandfs_daddr_t *vblk) { struct nandfs_inode *ip; - struct nandfs_indir a[NIADDR + 1], *ap; + struct nandfs_indir a[NANDFS_NIADDR + 1], *ap; nandfs_daddr_t daddr; struct buf *bp; int error; int num, *nump; DPRINTF(BMAP, ("%s: node %p lblk %jx enter\n", __func__, node, lblk)); ip = &node->nn_inode; ap = a; nump = # error = bmap_getlbns(node, lblk, ap, nump); if (error) return (error); if (num == 0) { *vblk = ip->i_db[lblk]; return (0); } DPRINTF(BMAP, ("%s: node %p lblk=%jx trying ip->i_ib[%x]\n", __func__, node, lblk, ap->in_off)); daddr = ip->i_ib[ap->in_off]; for (bp = NULL, ++ap; --num; ap++) { if (daddr == 0) { DPRINTF(BMAP, ("%s: node %p lblk=%jx returning with " "vblk 0\n", __func__, node, lblk)); *vblk = 0; return (0); } if (ap->in_lbn == lblk) { DPRINTF(BMAP, ("%s: node %p lblk=%jx ap->in_lbn=%jx " "returning address of indirect block (%jx)\n", __func__, node, lblk, ap->in_lbn, daddr)); *vblk = daddr; return (0); } DPRINTF(BMAP, ("%s: node %p lblk=%jx reading block " "ap->in_lbn=%jx\n", __func__, node, lblk, ap->in_lbn)); error = nandfs_bread_meta(node, ap->in_lbn, NOCRED, 0, &bp); if (error) { brelse(bp); return (error); } daddr = ((nandfs_daddr_t *)bp->b_data)[ap->in_off]; brelse(bp); } DPRINTF(BMAP, ("%s: node %p lblk=%jx returning with %jx\n", __func__, node, lblk, daddr)); *vblk = daddr; return (0); } int bmap_dirty_meta(struct nandfs_node *node, nandfs_lbn_t lblk, int force) { - struct nandfs_indir a[NIADDR+1], *ap; + struct nandfs_indir a[NANDFS_NIADDR+1], *ap; #ifdef DEBUG nandfs_daddr_t daddr; #endif struct buf *bp; int error; int num, *nump; DPRINTF(BMAP, ("%s: node %p lblk=%jx\n", __func__, node, lblk)); ap = a; nump = # error = bmap_getlbns(node, lblk, ap, nump); if (error) return (error); /* * Direct block, nothing to do */ if (num == 0) return (0); DPRINTF(BMAP, ("%s: node %p reading blocks\n", __func__, node)); for (bp = NULL, ++ap; --num; ap++) { error = nandfs_bread_meta(node, ap->in_lbn, NOCRED, 0, &bp); if (error) { brelse(bp); return (error); } #ifdef DEBUG daddr = ((nandfs_daddr_t *)bp->b_data)[ap->in_off]; MPASS(daddr != 0 || node->nn_ino == 3); #endif error = nandfs_dirty_buf_meta(bp, force); if (error) return (error); } return (0); } int bmap_insert_block(struct nandfs_node *node, nandfs_lbn_t lblk, nandfs_daddr_t vblk) { struct nandfs_inode *ip; - struct nandfs_indir a[NIADDR+1], *ap; + struct nandfs_indir a[NANDFS_NIADDR+1], *ap; struct buf *bp; nandfs_daddr_t daddr; int error; int num, *nump, i; DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx\n", __func__, node, lblk, vblk)); ip = &node->nn_inode; ap = a; nump = # error = bmap_getlbns(node, lblk, ap, nump); if (error) return (error); DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx got num=%d\n", __func__, node, lblk, vblk, num)); if (num == 0) { DPRINTF(BMAP, ("%s: node %p lblk=%jx direct block\n", __func__, node, lblk)); ip->i_db[lblk] = vblk; return (0); } DPRINTF(BMAP, ("%s: node %p lblk=%jx indirect block level %d\n", __func__, node, lblk, ap->in_off)); if (num == 1) { DPRINTF(BMAP, ("%s: node %p lblk=%jx indirect 
block: inserting " "%jx as vblk for indirect block %d\n", __func__, node, lblk, vblk, ap->in_off)); ip->i_ib[ap->in_off] = vblk; return (0); } bp = NULL; daddr = ip->i_ib[a[0].in_off]; for (i = 1; i < num; i++) { if (bp) brelse(bp); if (daddr == 0) { DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx create " "block %jx %d\n", __func__, node, lblk, vblk, a[i].in_lbn, a[i].in_off)); error = nandfs_bcreate_meta(node, a[i].in_lbn, NOCRED, 0, &bp); if (error) return (error); } else { DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx read " "block %jx %d\n", __func__, node, daddr, vblk, a[i].in_lbn, a[i].in_off)); error = nandfs_bread_meta(node, a[i].in_lbn, NOCRED, 0, &bp); if (error) { brelse(bp); return (error); } } daddr = ((nandfs_daddr_t *)bp->b_data)[a[i].in_off]; } i--; DPRINTF(BMAP, ("%s: bmap node %p lblk=%jx vblk=%jx inserting vblk level %d at " "offset %d at %jx\n", __func__, node, lblk, vblk, i, a[i].in_off, daddr)); if (!bp) { nandfs_error("%s: cannot find indirect block\n", __func__); return (-1); } ((nandfs_daddr_t *)bp->b_data)[a[i].in_off] = vblk; error = nandfs_dirty_buf_meta(bp, 0); if (error) { nandfs_warning("%s: dirty failed buf: %p\n", __func__, bp); return (error); } DPRINTF(BMAP, ("%s: exiting node %p lblk=%jx vblk=%jx\n", __func__, node, lblk, vblk)); return (error); } -CTASSERT(NIADDR <= 3); +CTASSERT(NANDFS_NIADDR <= 3); #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ static __inline nandfs_lbn_t lbn_offset(struct nandfs_device *fsdev, int level) { nandfs_lbn_t res; for (res = 1; level > 0; level--) res *= MNINDIR(fsdev); return (res); } static nandfs_lbn_t blocks_inside(struct nandfs_device *fsdev, int level, struct nandfs_indir *nip) { nandfs_lbn_t blocks; for (blocks = 1; level >= SINGLE; level--, nip++) { MPASS(nip->in_off >= 0 && nip->in_off < MNINDIR(fsdev)); blocks += nip->in_off * lbn_offset(fsdev, level); } return (blocks); } static int bmap_truncate_indirect(struct nandfs_node *node, int level, nandfs_lbn_t *left, int *cleaned, struct nandfs_indir *ap, struct nandfs_indir *fp, nandfs_daddr_t *copy) { struct buf *bp; nandfs_lbn_t i, lbn, nlbn, factor, tosub; struct nandfs_device *fsdev; int error, lcleaned, modified; DPRINTF(BMAP, ("%s: node %p level %d left %jx\n", __func__, node, level, *left)); fsdev = node->nn_nandfsdev; MPASS(ap->in_off >= 0 && ap->in_off < MNINDIR(fsdev)); factor = lbn_offset(fsdev, level); lbn = ap->in_lbn; error = nandfs_bread_meta(node, lbn, NOCRED, 0, &bp); if (error) { if (bp != NULL) brelse(bp); return (error); } bcopy(bp->b_data, copy, fsdev->nd_blocksize); bqrelse(bp); modified = 0; i = ap->in_off; if (ap != fp) ap++; for (nlbn = lbn + 1 - i * factor; i >= 0 && *left > 0; i--, nlbn += factor) { lcleaned = 0; DPRINTF(BMAP, ("%s: node %p i=%jx nlbn=%jx left=%jx ap=%p vblk %jx\n", __func__, node, i, nlbn, *left, ap, copy[i])); if (copy[i] == 0) { tosub = blocks_inside(fsdev, level - 1, ap); if (tosub > *left) tosub = 0; *left -= tosub; } else { if (level > SINGLE) { if (ap == fp) ap->in_lbn = nlbn; error = bmap_truncate_indirect(node, level - 1, left, &lcleaned, ap, fp, copy + MNINDIR(fsdev)); if (error) return (error); } else { error = nandfs_bdestroy(node, copy[i]); if (error) return (error); lcleaned = 1; *left -= 1; } } if (lcleaned) { if (level > SINGLE) { error = nandfs_vblock_end(fsdev, copy[i]); if (error) return (error); } copy[i] = 0; modified++; } ap = fp; } if (i == -1) *cleaned = 1; error = nandfs_bread_meta(node, 
lbn, NOCRED, 0, &bp); if (error) { brelse(bp); return (error); } if (modified) bcopy(copy, bp->b_data, fsdev->nd_blocksize); /* Force success even if we can't dirty the buffer metadata when freeing space */ nandfs_dirty_buf_meta(bp, 1); return (0); } int bmap_truncate_mapping(struct nandfs_node *node, nandfs_lbn_t lastblk, nandfs_lbn_t todo) { struct nandfs_inode *ip; - struct nandfs_indir a[NIADDR + 1], f[NIADDR], *ap; - nandfs_daddr_t indir_lbn[NIADDR]; + struct nandfs_indir a[NANDFS_NIADDR + 1], f[NANDFS_NIADDR], *ap; + nandfs_daddr_t indir_lbn[NANDFS_NIADDR]; nandfs_daddr_t *copy; int error, level; nandfs_lbn_t left, tosub; struct nandfs_device *fsdev; int cleaned, i; int num, *nump; DPRINTF(BMAP, ("%s: node %p lastblk %jx truncating by %jx\n", __func__, node, lastblk, todo)); ip = &node->nn_inode; fsdev = node->nn_nandfsdev; ap = a; nump = &num; error = bmap_getlbns(node, lastblk, ap, nump); if (error) return (error); - indir_lbn[SINGLE] = -NDADDR; + indir_lbn[SINGLE] = -NANDFS_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - MNINDIR(fsdev) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - MNINDIR(fsdev) * MNINDIR(fsdev) - 1; - for (i = 0; i < NIADDR; i++) { + for (i = 0; i < NANDFS_NIADDR; i++) { f[i].in_off = MNINDIR(fsdev) - 1; f[i].in_lbn = 0xdeadbeef; } left = todo; #ifdef DEBUG a[num].in_off = -1; #endif ap++; num -= 2; if (num < 0) goto direct; copy = malloc(MNINDIR(fsdev) * sizeof(nandfs_daddr_t) * (num + 1), M_NANDFSTEMP, M_WAITOK); for (level = num; level >= SINGLE && left > 0; level--) { cleaned = 0; if (ip->i_ib[level] == 0) { tosub = blocks_inside(fsdev, level, ap); if (tosub > left) left = 0; else left -= tosub; } else { if (ap == f) ap->in_lbn = indir_lbn[level]; error = bmap_truncate_indirect(node, level, &left, &cleaned, ap, f, copy); if (error) { free(copy, M_NANDFSTEMP); nandfs_error("%s: error %d when truncate " "at level %d\n", __func__, error, level); return (error); } } if (cleaned) { nandfs_vblock_end(fsdev, ip->i_ib[level]); ip->i_ib[level] = 0; } ap = f; } free(copy, M_NANDFSTEMP); direct: if (num < 0) i = lastblk; else - i = NDADDR - 1; + i = NANDFS_NDADDR - 1; for (; i >= 0 && left > 0; i--) { if (ip->i_db[i] != 0) { error = nandfs_bdestroy(node, ip->i_db[i]); if (error) { nandfs_error("%s: cannot destroy " "block %jx, error %d\n", __func__, (uintmax_t)ip->i_db[i], error); return (error); } ip->i_db[i] = 0; } left--; } KASSERT(left == 0, ("truncated wrong number of blocks (%jd should be 0)", left)); return (error); } nandfs_lbn_t get_maxfilesize(struct nandfs_device *fsdev) { - struct nandfs_indir f[NIADDR]; + struct nandfs_indir f[NANDFS_NIADDR]; nandfs_lbn_t max; int i; - max = NDADDR; + max = NANDFS_NDADDR; - for (i = 0; i < NIADDR; i++) { + for (i = 0; i < NANDFS_NIADDR; i++) { f[i].in_off = MNINDIR(fsdev) - 1; max += blocks_inside(fsdev, i, f); } max *= fsdev->nd_blocksize; return (max); } /* * This is ufs_getlbns with minor modifications. */ /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself.
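 *
 * As an illustrative sketch of the result (hypothetical numbers, assuming
 * 4KB blocks so that MNINDIR(fsdev) == 512 addresses fit in one indirect
 * block): the first data block past the direct range, bn == 12, yields
 *
 *	a[0] = { .in_lbn = -12, .in_off = 0 }	offset into i_ib[] (SINGLE)
 *	a[1] = { .in_lbn = -12, .in_off = 0 }	offset within that block
 *
 * while bn == 12 + 512 + 5 == 529, which lives one full indirect page into
 * the double-indirect range, yields
 *
 *	a[0] = { .in_lbn = -525, .in_off = 1 }	i_ib[] slot (DOUBLE)
 *	a[1] = { .in_lbn = -525, .in_off = 0 }	first-level page
 *	a[2] = { .in_lbn = -524, .in_off = 5 }	second-level page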
*/ static int bmap_getlbns(struct nandfs_node *node, nandfs_lbn_t bn, struct nandfs_indir *ap, int *nump) { nandfs_daddr_t blockcnt; nandfs_lbn_t metalbn, realbn; struct nandfs_device *fsdev; int i, numlevels, off; fsdev = node->nn_nandfsdev; DPRINTF(BMAP, ("%s: node %p bn=%jx mnindir=%zd enter\n", __func__, node, bn, MNINDIR(fsdev))); if (nump) *nump = 0; numlevels = 0; realbn = bn; if (bn < 0) bn = -bn; - /* The first NDADDR blocks are direct blocks. */ - if (bn < NDADDR) + /* The first NANDFS_NDADDR blocks are direct blocks. */ + if (bn < NANDFS_NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible - * at the previous level of indirection, and NIADDR - i is the number - * of levels of indirection needed to locate the requested block. + * at the previous level of indirection, and NANDFS_NIADDR - i is the + * number of levels of indirection needed to locate the requested block. */ - for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + for (blockcnt = 1, i = NANDFS_NIADDR, bn -= NANDFS_NDADDR;; i--, bn -= blockcnt) { DPRINTF(BMAP, ("%s: blockcnt=%jd i=%d bn=%jd\n", __func__, blockcnt, i, bn)); if (i == 0) return (EFBIG); blockcnt *= MNINDIR(fsdev); if (bn < blockcnt) break; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) - metalbn = -(realbn - bn + NIADDR - i); + metalbn = -(realbn - bn + NANDFS_NIADDR - i); else - metalbn = -(-realbn - bn + NIADDR - i); + metalbn = -(-realbn - bn + NANDFS_NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; - ap->in_off = off = NIADDR - i; + ap->in_off = off = NANDFS_NIADDR - i; DPRINTF(BMAP, ("%s: initial: ap->in_lbn=%jx ap->in_off=%d\n", __func__, metalbn, off)); ap++; - for (++numlevels; i <= NIADDR; i++) { + for (++numlevels; i <= NANDFS_NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; blockcnt /= MNINDIR(fsdev); off = (bn / blockcnt) % MNINDIR(fsdev); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; DPRINTF(BMAP, ("%s: in_lbn=%jx in_off=%d\n", __func__, ap->in_lbn, ap->in_off)); ++ap; metalbn -= -1 + off * blockcnt; } if (nump) *nump = numlevels; DPRINTF(BMAP, ("%s: numlevels=%d\n", __func__, numlevels)); return (0); } Index: head/sys/fs/nandfs/nandfs_fs.h =================================================================== --- head/sys/fs/nandfs/nandfs_fs.h (revision 313779) +++ head/sys/fs/nandfs/nandfs_fs.h (revision 313780) @@ -1,565 +1,565 @@ /*- * Copyright (c) 2010-2012 Semihalf * Copyright (c) 2008, 2009 Reinoud Zandijk * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Original definitions written by Koji Sato * and Ryusuke Konishi * From: NetBSD: nandfs_fs.h,v 1.1 2009/07/18 16:31:42 reinoud * * $FreeBSD$ */ #ifndef _NANDFS_FS_H #define _NANDFS_FS_H #include #define MNINDIR(fsdev) ((fsdev)->nd_blocksize / sizeof(nandfs_daddr_t)) /* * Inode structure. There are a few dedicated inode numbers that are * defined here first. */ #define NANDFS_WHT_INO 1 /* Whiteout ino */ #define NANDFS_ROOT_INO 2 /* Root file inode */ #define NANDFS_DAT_INO 3 /* DAT file */ #define NANDFS_CPFILE_INO 4 /* checkpoint file */ #define NANDFS_SUFILE_INO 5 /* segment usage file */ #define NANDFS_IFILE_INO 6 /* ifile */ #define NANDFS_GC_INO 7 /* Cleanerd node */ #define NANDFS_ATIME_INO 8 /* Atime file (reserved) */ #define NANDFS_XATTR_INO 9 /* Xattribute file (reserved) */ #define NANDFS_SKETCH_INO 10 /* Sketch file (obsolete) */ #define NANDFS_USER_INO 11 /* First user's file inode number */ #define NANDFS_SYS_NODE(ino) \ (((ino) >= NANDFS_DAT_INO) && ((ino) <= NANDFS_GC_INO)) -#define NDADDR 12 /* Direct addresses in inode. */ -#define NIADDR 3 /* Indirect addresses in inode. */ +#define NANDFS_NDADDR 12 /* Direct addresses in inode. */ +#define NANDFS_NIADDR 3 /* Indirect addresses in inode. */ typedef int64_t nandfs_daddr_t; typedef int64_t nandfs_lbn_t; struct nandfs_inode { uint64_t i_blocks; /* 0: size in device blocks */ uint64_t i_size; /* 8: size in bytes */ uint64_t i_ctime; /* 16: creation time in seconds */ uint64_t i_mtime; /* 24: modification time in seconds part*/ uint32_t i_ctime_nsec; /* 32: creation time nanoseconds part */ uint32_t i_mtime_nsec; /* 36: modification time in nanoseconds */ uint32_t i_uid; /* 40: user id */ uint32_t i_gid; /* 44: group id */ uint16_t i_mode; /* 48: file mode */ uint16_t i_links_count; /* 50: number of references to the inode*/ uint32_t i_flags; /* 52: NANDFS_*_FL flags */ nandfs_daddr_t i_special; /* 56: special */ - nandfs_daddr_t i_db[NDADDR]; /* 64: Direct disk blocks. */ - nandfs_daddr_t i_ib[NIADDR]; /* 160: Indirect disk blocks. */ + nandfs_daddr_t i_db[NANDFS_NDADDR]; /* 64: Direct disk blocks. */ + nandfs_daddr_t i_ib[NANDFS_NIADDR]; /* 160: Indirect disk blocks. */ uint64_t i_xattr; /* 184: reserved for extended attributes*/ uint32_t i_generation; /* 192: file generation for NFS */ uint32_t i_pad[15]; /* 196: make it 64 bits aligned */ }; #ifdef _KERNEL CTASSERT(sizeof(struct nandfs_inode) == 256); #endif /* * Each checkpoint/snapshot has a super root. * * The super root holds the inodes of the three system files: `dat', `cp' and * 'su' files. All other FS state is defined by those. * * It is CRC checksum'ed and time stamped. 
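 *
 * As a worked example (a sketch, assuming the 256-byte on-media inode
 * asserted above): the NANDFS_SR_MDT_OFFSET() macro below places sr_dat,
 * sr_cpfile and sr_sufile at byte offsets 16, 272 and 528 respectively,
 * and NANDFS_SR_BYTES comes out to 784.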
*/ struct nandfs_super_root { uint32_t sr_sum; /* check-sum */ uint16_t sr_bytes; /* byte count of this structure */ uint16_t sr_flags; /* reserved for flags */ uint64_t sr_nongc_ctime; /* timestamp, not for cleaner(?) */ struct nandfs_inode sr_dat; /* DAT, virt->phys translation inode */ struct nandfs_inode sr_cpfile; /* CP, checkpoints inode */ struct nandfs_inode sr_sufile; /* SU, segment usage inode */ }; #define NANDFS_SR_MDT_OFFSET(inode_size, i) \ ((uint32_t)&((struct nandfs_super_root *)0)->sr_dat + \ (inode_size) * (i)) #define NANDFS_SR_DAT_OFFSET(inode_size) NANDFS_SR_MDT_OFFSET(inode_size, 0) #define NANDFS_SR_CPFILE_OFFSET(inode_size) NANDFS_SR_MDT_OFFSET(inode_size, 1) #define NANDFS_SR_SUFILE_OFFSET(inode_size) NANDFS_SR_MDT_OFFSET(inode_size, 2) #define NANDFS_SR_BYTES (sizeof(struct nandfs_super_root)) /* * The superblock describes the basic structure and mount history. It also * records some sizes of structures found on the disc for sanity checks. * * The superblock is stored at two places: NANDFS_SB_OFFSET_BYTES and * NANDFS_SB2_OFFSET_BYTES. */ /* File system states stored on media in superblock's sbp->s_state */ #define NANDFS_VALID_FS 0x0001 /* cleanly unmounted and all is ok */ #define NANDFS_ERROR_FS 0x0002 /* there were errors detected, fsck */ #define NANDFS_RESIZE_FS 0x0004 /* resize required, XXX unknown flag*/ #define NANDFS_MOUNT_STATE_BITS "\20\1VALID_FS\2ERROR_FS\3RESIZE_FS" /* * Brief description of control structures: * * NANDFS_NFSAREAS first blocks contain fsdata and some amount of super blocks. * Simple round-robin policy is used in order to choose which block will * contain new super block. * * Simple case with 2 blocks: * 1: fsdata sblock1 [sblock3 [sblock5 ..]] * 2: fsdata sblock2 [sblock4 [sblock6 ..]] */ struct nandfs_fsdata { uint16_t f_magic; uint16_t f_bytes; uint32_t f_sum; /* checksum of fsdata */ uint32_t f_rev_level; /* major disk format revision */ uint64_t f_ctime; /* creation time (execution time of newfs) */ /* Block size represented as: blocksize = 1 << (f_log_block_size + 10) */ uint32_t f_log_block_size; uint16_t f_inode_size; /* size of an inode */ uint16_t f_dat_entry_size; /* size of a dat entry */ uint16_t f_checkpoint_size; /* size of a checkpoint */ uint16_t f_segment_usage_size; /* size of a segment usage */ uint16_t f_sbbytes; /* byte count of CRC calculation for super blocks. s_reserved is excluded! */ uint16_t f_errors; /* behaviour on detecting errors */ uint32_t f_erasesize; uint64_t f_nsegments; /* number of segm. in filesystem */ nandfs_daddr_t f_first_data_block; /* 1st seg disk block number */ uint32_t f_blocks_per_segment; /* number of blocks per segment */ uint32_t f_r_segments_percentage; /* reserved segments percentage */ struct uuid f_uuid; /* 128-bit uuid for volume */ char f_volume_name[16]; /* volume name */ uint32_t f_pad[104]; } __packed; #ifdef _KERNEL CTASSERT(sizeof(struct nandfs_fsdata) == 512); #endif struct nandfs_super_block { uint16_t s_magic; /* magic value for identification */ uint32_t s_sum; /* check sum of super block */ uint64_t s_last_cno; /* last checkpoint number */ uint64_t s_last_pseg; /* addr part. segm. 
written last */ uint64_t s_last_seq; /* seq.number of seg written last */ uint64_t s_free_blocks_count; /* free blocks count */ uint64_t s_mtime; /* mount time */ uint64_t s_wtime; /* write time */ uint16_t s_state; /* file system state */ char s_last_mounted[64]; /* directory where last mounted */ uint32_t s_c_interval; /* commit interval of segment */ uint32_t s_c_block_max; /* threshold of data amount for the segment construction */ uint32_t s_reserved[32]; /* padding to end of the block */ } __packed; #ifdef _KERNEL CTASSERT(sizeof(struct nandfs_super_block) == 256); #endif #define NANDFS_FSDATA_MAGIC 0xf8da #define NANDFS_SUPER_MAGIC 0x8008 #define NANDFS_NFSAREAS 4 #define NANDFS_DATA_OFFSET_BYTES(esize) (NANDFS_NFSAREAS * (esize)) #define NANDFS_SBLOCK_OFFSET_BYTES (sizeof(struct nandfs_fsdata)) #define NANDFS_DEF_BLOCKSIZE 4096 #define NANDFS_MIN_BLOCKSIZE 512 #define NANDFS_DEF_ERASESIZE (2 << 16) #define NANDFS_MIN_SEGSIZE NANDFS_DEF_ERASESIZE #define NANDFS_CURRENT_REV 9 /* current major revision */ #define NANDFS_FSDATA_CRC_BYTES offsetof(struct nandfs_fsdata, f_pad) /* Bytes count of super_block for CRC-calculation */ #define NANDFS_SB_BYTES offsetof(struct nandfs_super_block, s_reserved) /* Maximal count of links to a file */ #define NANDFS_LINK_MAX 32000 /* * Structure of a directory entry. * * Note that they can't span blocks; the rec_len fills out. */ #define NANDFS_NAME_LEN 255 struct nandfs_dir_entry { uint64_t inode; /* inode number */ uint16_t rec_len; /* directory entry length */ uint8_t name_len; /* name length */ uint8_t file_type; char name[NANDFS_NAME_LEN]; /* file name */ char pad; }; /* * NANDFS_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 8 */ #define NANDFS_DIR_PAD 8 #define NANDFS_DIR_ROUND (NANDFS_DIR_PAD - 1) #define NANDFS_DIR_NAME_OFFSET (offsetof(struct nandfs_dir_entry, name)) #define NANDFS_DIR_REC_LEN(name_len) \ (((name_len) + NANDFS_DIR_NAME_OFFSET + NANDFS_DIR_ROUND) \ & ~NANDFS_DIR_ROUND) #define NANDFS_DIR_NAME_LEN(name_len) \ (NANDFS_DIR_REC_LEN(name_len) - NANDFS_DIR_NAME_OFFSET) /* * NiLFS/NANDFS divides the disc into fixed length segments. Each segment is * filled with one or more partial segments of variable lengths. * * Each partial segment has a segment summary header followed by updates of * files and optionally a super root. */ /* * Virtual to physical block translation information. For data blocks it maps * logical block number bi_blkoff to virtual block nr bi_vblocknr. For non * datablocks it is the virtual block number assigned to an indirect block * and has no bi_blkoff. The physical block number is the next * available data block in the partial segment after all the binfo's. */ struct nandfs_binfo_v { uint64_t bi_ino; /* file's inode */ uint64_t bi_vblocknr; /* assigned virtual block number */ uint64_t bi_blkoff; /* for file's logical block number */ }; /* * DAT allocation. For data blocks just the logical block number that maps on * the next available data block in the partial segment after the binfo's.
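 *
 * For illustration only: a DAT file data block at logical offset 3 would
 * be described as { .bi_ino = NANDFS_DAT_INO, .bi_blkoff = 3,
 * .bi_level = 0 }, while an indirect (meta) block of the DAT file is
 * flagged by a non-zero bi_level.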
*/ struct nandfs_binfo_dat { uint64_t bi_ino; uint64_t bi_blkoff; /* DAT file's logical block number */ uint8_t bi_level; /* whether this is meta block */ uint8_t bi_pad[7]; }; #ifdef _KERNEL CTASSERT(sizeof(struct nandfs_binfo_v) == sizeof(struct nandfs_binfo_dat)); #endif /* Convenience union for both types of binfo's */ union nandfs_binfo { struct nandfs_binfo_v bi_v; struct nandfs_binfo_dat bi_dat; }; /* Indirect buffers path */ struct nandfs_indir { nandfs_daddr_t in_lbn; int in_off; }; /* The (partial) segment summary */ struct nandfs_segment_summary { uint32_t ss_datasum; /* CRC of complete data block */ uint32_t ss_sumsum; /* CRC of segment summary only */ uint32_t ss_magic; /* magic to identify segment summary */ uint16_t ss_bytes; /* size of segment summary structure */ uint16_t ss_flags; /* NANDFS_SS_* flags */ uint64_t ss_seq; /* sequence number of this segm. sum */ uint64_t ss_create; /* creation timestamp in seconds */ uint64_t ss_next; /* blocknumber of next segment */ uint32_t ss_nblocks; /* number of blocks used by summary */ uint32_t ss_nbinfos; /* number of binfo structures */ uint32_t ss_sumbytes; /* total size of segment summary */ uint32_t ss_pad; /* stream of binfo structures */ }; #define NANDFS_SEGSUM_MAGIC 0x8e680011 /* segment summary magic number */ /* Segment summary flags */ #define NANDFS_SS_LOGBGN 0x0001 /* begins a logical segment */ #define NANDFS_SS_LOGEND 0x0002 /* ends a logical segment */ #define NANDFS_SS_SR 0x0004 /* has super root */ #define NANDFS_SS_SYNDT 0x0008 /* includes data only updates */ #define NANDFS_SS_GC 0x0010 /* segment written for cleaner operation */ #define NANDFS_SS_FLAG_BITS "\20\1LOGBGN\2LOGEND\3SR\4SYNDT\5GC" /* Segment summary constraints */ #define NANDFS_SEG_MIN_BLOCKS 16 /* minimum number of blocks in a full segment */ #define NANDFS_PSEG_MIN_BLOCKS 2 /* minimum number of blocks in a partial segment */ #define NANDFS_MIN_NRSVSEGS 8 /* minimum number of reserved segments */ /* * Structure of DAT/inode file. * * A DAT file is divided into groups. The maximum number of groups is the * number of block group descriptors that fit into one block; this descriptor * only gives the number of free entries in the associated group. * * Each group has a block sized bitmap indicating if an entry is taken or * empty. Each bit stands for a DAT entry. * * The inode file has exactly the same format only the entries are inode * entries. */ struct nandfs_block_group_desc { uint32_t bg_nfrees; /* num. free entries in block group */ }; /* DAT entry in a super root's DAT file */ struct nandfs_dat_entry { uint64_t de_blocknr; /* block number */ uint64_t de_start; /* valid from checkpoint */ uint64_t de_end; /* valid till checkpoint */ uint64_t de_rsv; /* reserved for future use */ }; /* * Structure of CP file. * * A snapshot is just a checkpoint, only it's protected against removal by the * cleaner. The snapshots are kept on a double linked list of checkpoints. */ struct nandfs_snapshot_list { uint64_t ssl_next; /* checkpoint nr. forward */ uint64_t ssl_prev; /* checkpoint nr. back */ }; /* Checkpoint entry structure */ struct nandfs_checkpoint { uint32_t cp_flags; /* NANDFS_CHECKPOINT_* flags */ uint32_t cp_checkpoints_count; /* ZERO, not used anymore?
*/ struct nandfs_snapshot_list cp_snapshot_list; /* list of snapshots */ uint64_t cp_cno; /* checkpoint number */ uint64_t cp_create; /* creation timestamp */ uint64_t cp_nblk_inc; /* number of blocks incremented */ uint64_t cp_blocks_count; /* reserved (might be deleted) */ struct nandfs_inode cp_ifile_inode; /* inode file inode */ }; /* Checkpoint flags */ #define NANDFS_CHECKPOINT_SNAPSHOT 1 #define NANDFS_CHECKPOINT_INVALID 2 #define NANDFS_CHECKPOINT_SKETCH 4 #define NANDFS_CHECKPOINT_MINOR 8 #define NANDFS_CHECKPOINT_BITS "\20\1SNAPSHOT\2INVALID\3SKETCH\4MINOR" /* Header of the checkpoint file */ struct nandfs_cpfile_header { uint64_t ch_ncheckpoints; /* number of checkpoints */ uint64_t ch_nsnapshots; /* number of snapshots */ struct nandfs_snapshot_list ch_snapshot_list; /* snapshot list */ }; #define NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET \ ((sizeof(struct nandfs_cpfile_header) + \ sizeof(struct nandfs_checkpoint) - 1) / \ sizeof(struct nandfs_checkpoint)) #define NANDFS_NOSEGMENT 0xffffffff /* * Structure of SU file. * * The segment usage file sums up how each of the segments are used. They are * indexed by their segment number. */ /* Segment usage entry */ struct nandfs_segment_usage { uint64_t su_lastmod; /* last modified timestamp */ uint32_t su_nblocks; /* number of blocks in segment */ uint32_t su_flags; /* NANDFS_SEGMENT_USAGE_* flags */ }; /* Segment usage flag */ #define NANDFS_SEGMENT_USAGE_ACTIVE 1 #define NANDFS_SEGMENT_USAGE_DIRTY 2 #define NANDFS_SEGMENT_USAGE_ERROR 4 #define NANDFS_SEGMENT_USAGE_GC 8 #define NANDFS_SEGMENT_USAGE_BITS "\20\1ACTIVE\2DIRTY\3ERROR" /* Header of the segment usage file */ struct nandfs_sufile_header { uint64_t sh_ncleansegs; /* number of segments marked clean */ uint64_t sh_ndirtysegs; /* number of segments marked dirty */ uint64_t sh_last_alloc; /* last allocated segment number */ }; #define NANDFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET \ ((sizeof(struct nandfs_sufile_header) + \ sizeof(struct nandfs_segment_usage) - 1) / \ sizeof(struct nandfs_segment_usage)) struct nandfs_seg_stat { uint64_t nss_nsegs; uint64_t nss_ncleansegs; uint64_t nss_ndirtysegs; uint64_t nss_ctime; uint64_t nss_nongc_ctime; uint64_t nss_prot_seq; }; enum { NANDFS_CHECKPOINT, NANDFS_SNAPSHOT }; #define NANDFS_CPINFO_MAX 512 struct nandfs_cpinfo { uint32_t nci_flags; uint32_t nci_pad; uint64_t nci_cno; uint64_t nci_create; uint64_t nci_nblk_inc; uint64_t nci_blocks_count; uint64_t nci_next; }; #define NANDFS_SEGMENTS_MAX 512 struct nandfs_suinfo { uint64_t nsi_num; uint64_t nsi_lastmod; uint32_t nsi_blocks; uint32_t nsi_flags; }; #define NANDFS_VINFO_MAX 512 struct nandfs_vinfo { uint64_t nvi_ino; uint64_t nvi_vblocknr; uint64_t nvi_start; uint64_t nvi_end; uint64_t nvi_blocknr; int nvi_alive; }; struct nandfs_cpmode { uint64_t ncpm_cno; uint32_t ncpm_mode; uint32_t ncpm_pad; }; struct nandfs_argv { uint64_t nv_base; uint32_t nv_nmembs; uint16_t nv_size; uint16_t nv_flags; uint64_t nv_index; }; struct nandfs_cpstat { uint64_t ncp_cno; uint64_t ncp_ncps; uint64_t ncp_nss; }; struct nandfs_period { uint64_t p_start; uint64_t p_end; }; struct nandfs_vdesc { uint64_t vd_ino; uint64_t vd_cno; uint64_t vd_vblocknr; struct nandfs_period vd_period; uint64_t vd_blocknr; uint64_t vd_offset; uint32_t vd_flags; uint32_t vd_pad; }; struct nandfs_bdesc { uint64_t bd_ino; uint64_t bd_oblocknr; uint64_t bd_blocknr; uint64_t bd_offset; uint32_t bd_level; uint32_t bd_alive; }; #ifndef _KERNEL #ifndef MNAMELEN #define MNAMELEN 88 #endif #endif struct nandfs_fsinfo { struct nandfs_fsdata 
fs_fsdata; struct nandfs_super_block fs_super; char fs_dev[MNAMELEN]; }; #define NANDFS_MAX_MOUNTS 65535 #define NANDFS_IOCTL_GET_SUSTAT _IOR('N', 100, struct nandfs_seg_stat) #define NANDFS_IOCTL_CHANGE_CPMODE _IOWR('N', 101, struct nandfs_cpmode) #define NANDFS_IOCTL_GET_CPINFO _IOWR('N', 102, struct nandfs_argv) #define NANDFS_IOCTL_DELETE_CP _IOWR('N', 103, uint64_t[2]) #define NANDFS_IOCTL_GET_CPSTAT _IOR('N', 104, struct nandfs_cpstat) #define NANDFS_IOCTL_GET_SUINFO _IOWR('N', 105, struct nandfs_argv) #define NANDFS_IOCTL_GET_VINFO _IOWR('N', 106, struct nandfs_argv) #define NANDFS_IOCTL_GET_BDESCS _IOWR('N', 107, struct nandfs_argv) #define NANDFS_IOCTL_GET_FSINFO _IOR('N', 108, struct nandfs_fsinfo) #define NANDFS_IOCTL_MAKE_SNAP _IOWR('N', 109, uint64_t) #define NANDFS_IOCTL_DELETE_SNAP _IOWR('N', 110, uint64_t) #define NANDFS_IOCTL_SYNC _IOWR('N', 111, uint64_t) #endif /* _NANDFS_FS_H */ Index: head/sys/fs/nfsclient/nfs_clvfsops.c =================================================================== --- head/sys/fs/nfsclient/nfs_clvfsops.c (revision 313779) +++ head/sys/fs/nfsclient/nfs_clvfsops.c (revision 313780) @@ -1,2003 +1,2003 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from nfs_vfsops.c 8.12 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfsroot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(nfscl, "NFSv4 client"); extern int nfscl_ticks; extern struct timeval nfsboottime; extern int nfsrv_useacl; extern int nfscl_debuglevel; extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; extern struct mtx ncl_iod_mutex; NFSCLSTATEMUTEX; MALLOC_DEFINE(M_NEWNFSREQ, "newnfsclient_req", "NFS request header"); MALLOC_DEFINE(M_NEWNFSMNT, "newnfsmnt", "NFS mount struct"); SYSCTL_DECL(_vfs_nfs); static int nfs_ip_paranoia = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW, &nfs_ip_paranoia, 0, ""); static int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_INITIAL_DELAY, downdelayinitial, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, ""); /* how long between console messages "nfs server foo not responding" */ static int nfs_tprintf_delay = NFS_TPRINTF_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY, downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); #ifdef NFS_DEBUG int nfs_debug; SYSCTL_INT(_vfs_nfs, OID_AUTO, debug, CTLFLAG_RW, &nfs_debug, 0, "Toggle debug flag"); #endif static int nfs_mountroot(struct mount *); static void nfs_sec_name(char *, int *); static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp, const char *, struct ucred *, struct thread *); static int mountnfs(struct nfs_args *, struct mount *, struct sockaddr *, char *, u_char *, int, u_char *, int, u_char *, int, struct vnode **, struct ucred *, struct thread *, int, int, int); static void nfs_getnlminfo(struct vnode *, uint8_t *, size_t *, struct sockaddr_storage *, int *, off_t *, struct timeval *); static vfs_mount_t nfs_mount; static vfs_cmount_t nfs_cmount; static vfs_unmount_t nfs_unmount; static vfs_root_t nfs_root; static vfs_statfs_t nfs_statfs; static vfs_sync_t nfs_sync; static vfs_sysctl_t nfs_sysctl; static vfs_purge_t nfs_purge; /* * nfs vfs operations. */ static struct vfsops nfs_vfsops = { .vfs_init = ncl_init, .vfs_mount = nfs_mount, .vfs_cmount = nfs_cmount, .vfs_root = nfs_root, .vfs_statfs = nfs_statfs, .vfs_sync = nfs_sync, .vfs_uninit = ncl_uninit, .vfs_unmount = nfs_unmount, .vfs_sysctl = nfs_sysctl, .vfs_purge = nfs_purge, }; VFS_SET(nfs_vfsops, nfs, VFCF_NETWORK | VFCF_SBDRY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfs, 1); MODULE_DEPEND(nfs, nfscommon, 1, 1, 1); MODULE_DEPEND(nfs, krpc, 1, 1, 1); MODULE_DEPEND(nfs, nfssvc, 1, 1, 1); MODULE_DEPEND(nfs, nfslock, 1, 1, 1); /* * This structure is now defined in sys/nfs/nfs_diskless.c so that it * can be shared by both NFS clients. It is declared here so that it * will be defined for kernels built without NFS_ROOT, although it * isn't used in that case. 
*/ #if !defined(NFS_ROOT) struct nfs_diskless nfs_diskless = { { { 0 } } }; struct nfsv3_diskless nfsv3_diskless = { { { 0 } } }; int nfs_diskless_valid = 0; #endif SYSCTL_INT(_vfs_nfs, OID_AUTO, diskless_valid, CTLFLAG_RD, &nfs_diskless_valid, 0, "Has the diskless struct been filled correctly"); SYSCTL_STRING(_vfs_nfs, OID_AUTO, diskless_rootpath, CTLFLAG_RD, nfsv3_diskless.root_hostnam, 0, "Path to nfs root"); SYSCTL_OPAQUE(_vfs_nfs, OID_AUTO, diskless_rootaddr, CTLFLAG_RD, &nfsv3_diskless.root_saddr, sizeof(nfsv3_diskless.root_saddr), "%Ssockaddr_in", "Diskless root nfs address"); void newnfsargs_ntoh(struct nfs_args *); static int nfs_mountdiskless(char *, struct sockaddr_in *, struct nfs_args *, struct thread *, struct vnode **, struct mount *); static void nfs_convert_diskless(void); static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs); int newnfs_iosize(struct nfsmount *nmp) { int iosize, maxio; /* First, set the upper limit for iosize */ if (nmp->nm_flag & NFSMNT_NFSV4) { maxio = NFS_MAXBSIZE; } else if (nmp->nm_flag & NFSMNT_NFSV3) { if (nmp->nm_sotype == SOCK_DGRAM) maxio = NFS_MAXDGRAMDATA; else maxio = NFS_MAXBSIZE; } else { maxio = NFS_V2MAXDATA; } if (nmp->nm_rsize > maxio || nmp->nm_rsize == 0) nmp->nm_rsize = maxio; if (nmp->nm_rsize > NFS_MAXBSIZE) nmp->nm_rsize = NFS_MAXBSIZE; if (nmp->nm_readdirsize > maxio || nmp->nm_readdirsize == 0) nmp->nm_readdirsize = maxio; if (nmp->nm_readdirsize > nmp->nm_rsize) nmp->nm_readdirsize = nmp->nm_rsize; if (nmp->nm_wsize > maxio || nmp->nm_wsize == 0) nmp->nm_wsize = maxio; if (nmp->nm_wsize > NFS_MAXBSIZE) nmp->nm_wsize = NFS_MAXBSIZE; /* * Calculate the size used for io buffers. Use the larger * of the two sizes to minimise nfs requests but make sure * that it is at least one VM page to avoid wasting buffer * space. It must also be at least NFS_DIRBLKSIZ, since * that is the buffer size used for directories. 
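	 *
	 * For example (hypothetical sizes): with nm_rsize = 8192 and
	 * nm_wsize = 32768 on a machine with 4KB pages, the imax() calls
	 * below settle on an f_iosize of 32768, assuming NFS_DIRBLKSIZ is
	 * no larger than that.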
*/ iosize = imax(nmp->nm_rsize, nmp->nm_wsize); iosize = imax(iosize, PAGE_SIZE); iosize = imax(iosize, NFS_DIRBLKSIZ); nmp->nm_mountp->mnt_stat.f_iosize = iosize; return (iosize); } static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs) { args->version = NFS_ARGSVERSION; args->addr = oargs->addr; args->addrlen = oargs->addrlen; args->sotype = oargs->sotype; args->proto = oargs->proto; args->fh = oargs->fh; args->fhsize = oargs->fhsize; args->flags = oargs->flags; args->wsize = oargs->wsize; args->rsize = oargs->rsize; args->readdirsize = oargs->readdirsize; args->timeo = oargs->timeo; args->retrans = oargs->retrans; args->readahead = oargs->readahead; args->hostname = oargs->hostname; } static void nfs_convert_diskless(void) { bcopy(&nfs_diskless.myif, &nfsv3_diskless.myif, sizeof(struct ifaliasreq)); bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway, sizeof(struct sockaddr_in)); nfs_convert_oargs(&nfsv3_diskless.root_args,&nfs_diskless.root_args); if (nfsv3_diskless.root_args.flags & NFSMNT_NFSV3) { nfsv3_diskless.root_fhsize = NFSX_MYFH; bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_MYFH); } else { nfsv3_diskless.root_fhsize = NFSX_V2FH; bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH); } bcopy(&nfs_diskless.root_saddr,&nfsv3_diskless.root_saddr, sizeof(struct sockaddr_in)); bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam, MNAMELEN); nfsv3_diskless.root_time = nfs_diskless.root_time; bcopy(nfs_diskless.my_hostnam, nfsv3_diskless.my_hostnam, MAXHOSTNAMELEN); nfs_diskless_valid = 3; } /* * nfs statfs call */ static int nfs_statfs(struct mount *mp, struct statfs *sbp) { struct vnode *vp; struct thread *td; struct nfsmount *nmp = VFSTONFS(mp); struct nfsvattr nfsva; struct nfsfsinfo fs; struct nfsstatfs sb; int error = 0, attrflag, gotfsinfo = 0, ret; struct nfsnode *np; td = curthread; error = vfs_busy(mp, MBF_NOWAIT); if (error) return (error); error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) { vfs_unbusy(mp); return (error); } vp = NFSTOV(np); mtx_lock(&nmp->nm_mtx); if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) { mtx_unlock(&nmp->nm_mtx); error = nfsrpc_fsinfo(vp, &fs, td->td_ucred, td, &nfsva, &attrflag, NULL); if (!error) gotfsinfo = 1; } else mtx_unlock(&nmp->nm_mtx); if (!error) error = nfsrpc_statfs(vp, &sb, &fs, td->td_ucred, td, &nfsva, &attrflag, NULL); if (error != 0) NFSCL_DEBUG(2, "statfs=%d\n", error); if (attrflag == 0) { ret = nfsrpc_getattrnovp(nmp, nmp->nm_fh, nmp->nm_fhsize, 1, td->td_ucred, td, &nfsva, NULL, NULL); if (ret) { /* * Just set default values to get things going. 
*/ NFSBZERO((caddr_t)&nfsva, sizeof (struct nfsvattr)); nfsva.na_vattr.va_type = VDIR; nfsva.na_vattr.va_mode = 0777; nfsva.na_vattr.va_nlink = 100; nfsva.na_vattr.va_uid = (uid_t)0; nfsva.na_vattr.va_gid = (gid_t)0; nfsva.na_vattr.va_fileid = 2; nfsva.na_vattr.va_gen = 1; nfsva.na_vattr.va_blocksize = NFS_FABLKSIZE; nfsva.na_vattr.va_size = 512 * 1024; } } (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (!error) { mtx_lock(&nmp->nm_mtx); if (gotfsinfo || (nmp->nm_flag & NFSMNT_NFSV4)) nfscl_loadfsinfo(nmp, &fs); nfscl_loadsbinfo(nmp, &sb, sbp); sbp->f_iosize = newnfs_iosize(nmp); mtx_unlock(&nmp->nm_mtx); if (sbp != &mp->mnt_stat) { bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } strncpy(&sbp->f_fstypename[0], mp->mnt_vfc->vfc_name, MFSNAMELEN); } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } vput(vp); vfs_unbusy(mp); return (error); } /* * nfs version 3 fsinfo rpc call */ int ncl_fsinfo(struct nfsmount *nmp, struct vnode *vp, struct ucred *cred, struct thread *td) { struct nfsfsinfo fs; struct nfsvattr nfsva; int error, attrflag; error = nfsrpc_fsinfo(vp, &fs, cred, td, &nfsva, &attrflag, NULL); if (!error) { if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); mtx_lock(&nmp->nm_mtx); nfscl_loadfsinfo(nmp, &fs); mtx_unlock(&nmp->nm_mtx); } return (error); } /* * Mount a remote root fs via. nfs. This depends on the info in the * nfs_diskless structure that has been filled in properly by some primary * bootstrap. * It goes something like this: * - do enough of "ifconfig" by calling ifioctl() so that the system * can talk to the server * - If nfs_diskless.mygateway is filled in, use that address as * a default gateway. * - build the rootfs mount point and call mountnfs() to do the rest. * * It is assumed to be safe to read, modify, and write the nfsv3_diskless * structure, as well as other global NFS client variables here, as * nfs_mountroot() will be called once in the boot before any other NFS * client activity occurs. */ static int nfs_mountroot(struct mount *mp) { struct thread *td = curthread; struct nfsv3_diskless *nd = &nfsv3_diskless; struct socket *so; struct vnode *vp; struct ifreq ir; int error; u_long l; char buf[128]; char *cp; #if defined(BOOTP_NFSROOT) && defined(BOOTP) bootpc_init(); /* use bootp to get nfs_diskless filled in */ #elif defined(NFS_ROOT) nfs_setup_diskless(); #endif if (nfs_diskless_valid == 0) return (-1); if (nfs_diskless_valid == 1) nfs_convert_diskless(); /* * XXX splnet, so networks will receive... */ splnet(); /* * Do enough of ifconfig(8) so that the critical net interface can * talk to the server. */ error = socreate(nd->myif.ifra_addr.sa_family, &so, nd->root_args.sotype, 0, td->td_ucred, td); if (error) panic("nfs_mountroot: socreate(%04x): %d", nd->myif.ifra_addr.sa_family, error); #if 0 /* XXX Bad idea */ /* * We might not have been told the right interface, so we pass * over the first ten interfaces of the same kind, until we get * one of them configured. 
*/ for (i = strlen(nd->myif.ifra_name) - 1; nd->myif.ifra_name[i] >= '0' && nd->myif.ifra_name[i] <= '9'; nd->myif.ifra_name[i] ++) { error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if(!error) break; } #endif error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if (error) panic("nfs_mountroot: SIOCAIFADDR: %d", error); if ((cp = kern_getenv("boot.netif.mtu")) != NULL) { ir.ifr_mtu = strtol(cp, NULL, 10); bcopy(nd->myif.ifra_name, ir.ifr_name, IFNAMSIZ); freeenv(cp); error = ifioctl(so, SIOCSIFMTU, (caddr_t)&ir, td); if (error) printf("nfs_mountroot: SIOCSIFMTU: %d", error); } soclose(so); /* * If the gateway field is filled in, set it as the default route. * Note that pxeboot will set a default route of 0 if the route * is not set by the DHCP server. Check also for a value of 0 * to avoid panicking inappropriately in that situation. */ if (nd->mygateway.sin_len != 0 && nd->mygateway.sin_addr.s_addr != 0) { struct sockaddr_in mask, sin; bzero((caddr_t)&mask, sizeof(mask)); sin = mask; sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); /* XXX MRT use table 0 for this sort of thing */ CURVNET_SET(TD_TO_VNET(td)); error = rtrequest_fib(RTM_ADD, (struct sockaddr *)&sin, (struct sockaddr *)&nd->mygateway, (struct sockaddr *)&mask, RTF_UP | RTF_GATEWAY, NULL, RT_DEFAULT_FIB); CURVNET_RESTORE(); if (error) panic("nfs_mountroot: RTM_ADD: %d", error); } /* * Create the rootfs mount point. */ nd->root_args.fh = nd->root_fh; nd->root_args.fhsize = nd->root_fhsize; l = ntohl(nd->root_saddr.sin_addr.s_addr); snprintf(buf, sizeof(buf), "%ld.%ld.%ld.%ld:%s", (l >> 24) & 0xff, (l >> 16) & 0xff, (l >> 8) & 0xff, (l >> 0) & 0xff, nd->root_hostnam); printf("NFS ROOT: %s\n", buf); nd->root_args.hostname = buf; if ((error = nfs_mountdiskless(buf, &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) { return (error); } /* * This is not really an nfs issue, but it is much easier to * set hostname here and then let the "/etc/rc.xxx" files * mount the right /var based upon its preset value. */ mtx_lock(&prison0.pr_mtx); strlcpy(prison0.pr_hostname, nd->my_hostnam, sizeof(prison0.pr_hostname)); mtx_unlock(&prison0.pr_mtx); inittodr(ntohl(nd->root_time)); return (0); } /* * Internal version of mount system call for diskless setup. */ static int nfs_mountdiskless(char *path, struct sockaddr_in *sin, struct nfs_args *args, struct thread *td, struct vnode **vpp, struct mount *mp) { struct sockaddr *nam; int dirlen, error; char *dirpath; /* * Find the directory path in "path", which also has the server's * name/ip address in it. */ dirpath = strchr(path, ':'); if (dirpath != NULL) dirlen = strlen(++dirpath); else dirlen = 0; nam = sodupsockaddr((struct sockaddr *)sin, M_WAITOK); if ((error = mountnfs(args, mp, nam, path, NULL, 0, dirpath, dirlen, NULL, 0, vpp, td->td_ucred, td, NFS_DEFAULT_NAMETIMEO, NFS_DEFAULT_NEGNAMETIMEO, 0)) != 0) { printf("nfs_mountroot: mount %s on /: %d\n", path, error); return (error); } return (0); } static void nfs_sec_name(char *sec, int *flagsp) { if (!strcmp(sec, "krb5")) *flagsp |= NFSMNT_KERB; else if (!strcmp(sec, "krb5i")) *flagsp |= (NFSMNT_KERB | NFSMNT_INTEGRITY); else if (!strcmp(sec, "krb5p")) *flagsp |= (NFSMNT_KERB | NFSMNT_PRIVACY); } static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp, const char *hostname, struct ucred *cred, struct thread *td) { int s; int adjsock; char *p; s = splnet(); /* * Set read-only flag if requested; otherwise, clear it if this is * an update. 
If this is not an update, then either the read-only * flag is already clear, or this is a root mount and it was set * intentionally at some previous point. */ if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if (mp->mnt_flag & MNT_UPDATE) { MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. Also, set up appropriate retransmit * and soft timeout behavior. */ if (argp->sotype == SOCK_STREAM) { nmp->nm_flag &= ~NFSMNT_NOCONN; nmp->nm_timeo = NFS_MAXTIMEO; if ((argp->flags & NFSMNT_NFSV4) != 0) nmp->nm_retry = INT_MAX; else nmp->nm_retry = NFS_RETRANS_TCP; } /* Also clear RDIRPLUS if NFSv2, it crashes some servers */ if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) { argp->flags &= ~NFSMNT_RDIRPLUS; nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } /* Re-bind if rsrvd port requested and wasn't on one */ adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT) && (argp->flags & NFSMNT_RESVPORT); /* Also re-bind if we're switching to/from a connected UDP socket */ adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) != (argp->flags & NFSMNT_NOCONN)); /* Update flags atomically. Don't change the lock bits. */ nmp->nm_flag = argp->flags | nmp->nm_flag; splx(s); if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) nmp->nm_timeo = NFS_MINTIMEO; else if (nmp->nm_timeo > NFS_MAXTIMEO) nmp->nm_timeo = NFS_MAXTIMEO; } if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { nmp->nm_retry = argp->retrans; if (nmp->nm_retry > NFS_MAXREXMIT) nmp->nm_retry = NFS_MAXREXMIT; } if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { nmp->nm_wsize = argp->wsize; /* * Clip at the power of 2 below the size. There is an * issue (not isolated) that causes intermittent page * faults if this is not done. */ if (nmp->nm_wsize > NFS_FABLKSIZE) nmp->nm_wsize = 1 << (fls(nmp->nm_wsize) - 1); else nmp->nm_wsize = NFS_FABLKSIZE; } if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { nmp->nm_rsize = argp->rsize; /* * Clip at the power of 2 below the size. There is an * issue (not isolated) that causes intermittent page * faults if this is not done. 
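	 *
	 * For example, a hypothetical rsize of 60000 is clipped to
	 * 1 << (fls(60000) - 1) == 32768, the power of 2 just below it.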
*/ if (nmp->nm_rsize > NFS_FABLKSIZE) nmp->nm_rsize = 1 << (fls(nmp->nm_rsize) - 1); else nmp->nm_rsize = NFS_FABLKSIZE; } if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { nmp->nm_readdirsize = argp->readdirsize; } if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) nmp->nm_acregmin = argp->acregmin; else nmp->nm_acregmin = NFS_MINATTRTIMO; if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) nmp->nm_acregmax = argp->acregmax; else nmp->nm_acregmax = NFS_MAXATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) nmp->nm_acdirmin = argp->acdirmin; else nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) nmp->nm_acdirmax = argp->acdirmax; else nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; if (nmp->nm_acdirmin > nmp->nm_acdirmax) nmp->nm_acdirmin = nmp->nm_acdirmax; if (nmp->nm_acregmin > nmp->nm_acregmax) nmp->nm_acregmin = nmp->nm_acregmax; if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) { if (argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; else nmp->nm_readahead = NFS_MAXRAHEAD; } if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) { if (argp->wcommitsize < nmp->nm_wsize) nmp->nm_wcommitsize = nmp->nm_wsize; else nmp->nm_wcommitsize = argp->wcommitsize; } adjsock |= ((nmp->nm_sotype != argp->sotype) || (nmp->nm_soproto != argp->proto)); if (nmp->nm_client != NULL && adjsock) { int haslock = 0, error = 0; if (nmp->nm_sotype == SOCK_STREAM) { error = newnfs_sndlock(&nmp->nm_sockreq.nr_lock); if (!error) haslock = 1; } if (!error) { newnfs_disconnect(&nmp->nm_sockreq); if (haslock) newnfs_sndunlock(&nmp->nm_sockreq.nr_lock); nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; if (nmp->nm_sotype == SOCK_DGRAM) while (newnfs_connect(nmp, &nmp->nm_sockreq, cred, td, 0)) { printf("newnfs_args: retrying connect\n"); (void) nfs_catnap(PSOCK, 0, "nfscon"); } } } else { nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; } if (hostname != NULL) { strlcpy(nmp->nm_hostname, hostname, sizeof(nmp->nm_hostname)); p = strchr(nmp->nm_hostname, ':'); if (p != NULL) *p = '\0'; } } static const char *nfs_opts[] = { "from", "nfs_args", "noac", "noatime", "noexec", "suiddir", "nosuid", "nosymfollow", "union", "noclusterr", "noclusterw", "multilabel", "acls", "force", "update", "async", "noconn", "nolockd", "conn", "lockd", "intr", "rdirplus", "readdirsize", "soft", "hard", "mntudp", "tcp", "udp", "wsize", "rsize", "retrans", "actimeo", "acregmin", "acregmax", "acdirmin", "acdirmax", "resvport", "readahead", "hostname", "timeo", "timeout", "addr", "fh", "nfsv3", "sec", "principal", "nfsv4", "gssname", "allgssname", "dirpath", "minorversion", "nametimeo", "negnametimeo", "nocto", "noncontigwr", "pnfs", "wcommitsize", NULL }; /* * Parse the "from" mountarg, passed by the generic mount(8) program * or the mountroot code. This is used when rerooting into NFS. * * Note that the "hostname" is actually a "hostname:/share/path" string. */ static int nfs_mount_parse_from(struct vfsoptlist *opts, char **hostnamep, struct sockaddr_in **sinp, char *dirpath, size_t dirpathsize, int *dirlenp) { char nam[MNAMELEN + 1]; char *delimp, *hostp, *spec; int error, have_bracket = 0, offset, rv, speclen; struct sockaddr_in *sin; size_t len; error = vfs_getopt(opts, "from", (void **)&spec, &speclen); if (error != 0) return (error); /* * This part comes from sbin/mount_nfs/mount_nfs.c:getnfsargs(). 
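 *
 * A few illustrative inputs (hypothetical addresses):
 *
 *	192.0.2.1:/export	host "192.0.2.1", path "/export"
 *	[2001:db8::1]:/export	the bracketed form is split the same way,
 *				but the inet_pton(AF_INET, ...) call below
 *				still rejects IPv6 addresses
 *	/export@192.0.2.1	deprecated form; a warning is printed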
*/ if (*spec == '[' && (delimp = strchr(spec + 1, ']')) != NULL && *(delimp + 1) == ':') { hostp = spec + 1; spec = delimp + 2; have_bracket = 1; } else if ((delimp = strrchr(spec, ':')) != NULL) { hostp = spec; spec = delimp + 1; } else if ((delimp = strrchr(spec, '@')) != NULL) { printf("%s: path@server syntax is deprecated, " "use server:path\n", __func__); hostp = delimp + 1; } else { printf("%s: no <host>:<dirpath> nfs-name\n", __func__); return (EINVAL); } *delimp = '\0'; /* * If there has been a trailing slash at mounttime it seems * that some mountd implementations fail to remove the mount * entries from their mountlist while unmounting. */ for (speclen = strlen(spec); speclen > 1 && spec[speclen - 1] == '/'; speclen--) spec[speclen - 1] = '\0'; if (strlen(hostp) + strlen(spec) + 1 > MNAMELEN) { printf("%s: %s:%s: name too long", __func__, hostp, spec); return (EINVAL); } /* Make both '@' and ':' notations equal */ if (*hostp != '\0') { len = strlen(hostp); offset = 0; if (have_bracket) nam[offset++] = '['; memmove(nam + offset, hostp, len); if (have_bracket) nam[len + offset++] = ']'; nam[len + offset++] = ':'; memmove(nam + len + offset, spec, speclen); nam[len + speclen + offset] = '\0'; } else nam[0] = '\0'; /* * XXX: IPv6 */ sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK); rv = inet_pton(AF_INET, hostp, &sin->sin_addr); if (rv != 1) { printf("%s: cannot parse '%s', inet_pton() returned %d\n", __func__, hostp, rv); free(sin, M_SONAME); return (EINVAL); } sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; /* * XXX: hardcoded port number. */ sin->sin_port = htons(2049); *hostnamep = strdup(nam, M_NEWNFSMNT); *sinp = sin; strlcpy(dirpath, spec, dirpathsize); *dirlenp = strlen(dirpath); return (0); } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the getsockaddr() call because getsockaddr() allocates an mbuf and * an error after that means that I have to release the mbuf. */ /* ARGSUSED */ static int nfs_mount(struct mount *mp) { struct nfs_args args = { .version = NFS_ARGSVERSION, .addr = NULL, .addrlen = sizeof (struct sockaddr_in), .sotype = SOCK_STREAM, .proto = 0, .fh = NULL, .fhsize = 0, .flags = NFSMNT_RESVPORT, .wsize = NFS_WSIZE, .rsize = NFS_RSIZE, .readdirsize = NFS_READDIRSIZE, .timeo = 10, .retrans = NFS_RETRANS, .readahead = NFS_DEFRAHEAD, .wcommitsize = 0, /* was: NQ_DEFLEASE */ .hostname = NULL, .acregmin = NFS_MINATTRTIMO, .acregmax = NFS_MAXATTRTIMO, .acdirmin = NFS_MINDIRATTRTIMO, .acdirmax = NFS_MAXDIRATTRTIMO, }; int error = 0, ret, len; struct sockaddr *nam = NULL; struct vnode *vp; struct thread *td; char hst[MNAMELEN]; u_char nfh[NFSX_FHMAX], krbname[100], dirpath[100], srvkrbname[100]; char *cp, *opt, *name, *secname; int nametimeo = NFS_DEFAULT_NAMETIMEO; int negnametimeo = NFS_DEFAULT_NEGNAMETIMEO; int minvers = 0; int dirlen, has_nfs_args_opt, has_nfs_from_opt, krbnamelen, srvkrbnamelen; size_t hstlen; has_nfs_args_opt = 0; has_nfs_from_opt = 0; if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) { error = EINVAL; goto out; } td = curthread; if ((mp->mnt_flag & (MNT_ROOTFS | MNT_UPDATE)) == MNT_ROOTFS && nfs_diskless_valid != 0) { error = nfs_mountroot(mp); goto out; } nfscl_init(); /* * The old mount_nfs program passed the struct nfs_args * from userspace to kernel. The new mount_nfs program * passes string options via nmount() from userspace to kernel * and we populate the struct nfs_args in the kernel.
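	 *
	 * For instance, a hypothetical "mount -t nfs -o nfsv4,soft,retrans=3
	 * server:/export /mnt" arrives here as the string options "nfsv4",
	 * "soft" and "retrans" (plus "from"), each picked up by the
	 * vfs_getopt() calls below.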
if (vfs_getopt(mp->mnt_optnew, "nfs_args", NULL, NULL) == 0) { error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof(args)); if (error != 0) goto out; if (args.version != NFS_ARGSVERSION) { error = EPROGMISMATCH; goto out; } has_nfs_args_opt = 1; } /* Handle the new style options. */ if (vfs_getopt(mp->mnt_optnew, "noac", NULL, NULL) == 0) { args.acdirmin = args.acdirmax = args.acregmin = args.acregmax = 0; args.flags |= NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX | NFSMNT_ACREGMIN | NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "noconn", NULL, NULL) == 0) args.flags |= NFSMNT_NOCONN; if (vfs_getopt(mp->mnt_optnew, "conn", NULL, NULL) == 0) args.flags &= ~NFSMNT_NOCONN; if (vfs_getopt(mp->mnt_optnew, "nolockd", NULL, NULL) == 0) args.flags |= NFSMNT_NOLOCKD; if (vfs_getopt(mp->mnt_optnew, "lockd", NULL, NULL) == 0) args.flags &= ~NFSMNT_NOLOCKD; if (vfs_getopt(mp->mnt_optnew, "intr", NULL, NULL) == 0) args.flags |= NFSMNT_INT; if (vfs_getopt(mp->mnt_optnew, "rdirplus", NULL, NULL) == 0) args.flags |= NFSMNT_RDIRPLUS; if (vfs_getopt(mp->mnt_optnew, "resvport", NULL, NULL) == 0) args.flags |= NFSMNT_RESVPORT; if (vfs_getopt(mp->mnt_optnew, "noresvport", NULL, NULL) == 0) args.flags &= ~NFSMNT_RESVPORT; if (vfs_getopt(mp->mnt_optnew, "soft", NULL, NULL) == 0) args.flags |= NFSMNT_SOFT; if (vfs_getopt(mp->mnt_optnew, "hard", NULL, NULL) == 0) args.flags &= ~NFSMNT_SOFT; if (vfs_getopt(mp->mnt_optnew, "mntudp", NULL, NULL) == 0) args.sotype = SOCK_DGRAM; if (vfs_getopt(mp->mnt_optnew, "udp", NULL, NULL) == 0) args.sotype = SOCK_DGRAM; if (vfs_getopt(mp->mnt_optnew, "tcp", NULL, NULL) == 0) args.sotype = SOCK_STREAM; if (vfs_getopt(mp->mnt_optnew, "nfsv3", NULL, NULL) == 0) args.flags |= NFSMNT_NFSV3; if (vfs_getopt(mp->mnt_optnew, "nfsv4", NULL, NULL) == 0) { args.flags |= NFSMNT_NFSV4; args.sotype = SOCK_STREAM; } if (vfs_getopt(mp->mnt_optnew, "allgssname", NULL, NULL) == 0) args.flags |= NFSMNT_ALLGSSNAME; if (vfs_getopt(mp->mnt_optnew, "nocto", NULL, NULL) == 0) args.flags |= NFSMNT_NOCTO; if (vfs_getopt(mp->mnt_optnew, "noncontigwr", NULL, NULL) == 0) args.flags |= NFSMNT_NONCONTIGWR; if (vfs_getopt(mp->mnt_optnew, "pnfs", NULL, NULL) == 0) args.flags |= NFSMNT_PNFS; if (vfs_getopt(mp->mnt_optnew, "readdirsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal readdirsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.readdirsize); if (ret != 1 || args.readdirsize <= 0) { vfs_mount_error(mp, "illegal readdirsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_READDIRSIZE; } if (vfs_getopt(mp->mnt_optnew, "readahead", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal readahead"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.readahead); if (ret != 1 || args.readahead <= 0) { vfs_mount_error(mp, "illegal readahead: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_READAHEAD; } if (vfs_getopt(mp->mnt_optnew, "wsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal wsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.wsize); if (ret != 1 || args.wsize <= 0) { vfs_mount_error(mp, "illegal wsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_WSIZE; } if (vfs_getopt(mp->mnt_optnew, "rsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal rsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.rsize); if (ret != 1 || args.rsize <= 0) { vfs_mount_error(mp, "illegal rsize: %s", opt); error =
EINVAL; goto out; } args.flags |= NFSMNT_RSIZE; } if (vfs_getopt(mp->mnt_optnew, "retrans", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal retrans"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.retrans); if (ret != 1 || args.retrans <= 0) { vfs_mount_error(mp, "illegal retrans: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_RETRANS; } if (vfs_getopt(mp->mnt_optnew, "actimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmin); if (ret != 1 || args.acregmin < 0) { vfs_mount_error(mp, "illegal actimeo: %s", opt); error = EINVAL; goto out; } args.acdirmin = args.acdirmax = args.acregmax = args.acregmin; args.flags |= NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX | NFSMNT_ACREGMIN | NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "acregmin", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmin); if (ret != 1 || args.acregmin < 0) { vfs_mount_error(mp, "illegal acregmin: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACREGMIN; } if (vfs_getopt(mp->mnt_optnew, "acregmax", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmax); if (ret != 1 || args.acregmax < 0) { vfs_mount_error(mp, "illegal acregmax: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "acdirmin", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acdirmin); if (ret != 1 || args.acdirmin < 0) { vfs_mount_error(mp, "illegal acdirmin: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACDIRMIN; } if (vfs_getopt(mp->mnt_optnew, "acdirmax", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acdirmax); if (ret != 1 || args.acdirmax < 0) { vfs_mount_error(mp, "illegal acdirmax: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACDIRMAX; } if (vfs_getopt(mp->mnt_optnew, "wcommitsize", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.wcommitsize); if (ret != 1 || args.wcommitsize < 0) { vfs_mount_error(mp, "illegal wcommitsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_WCOMMITSIZE; } if (vfs_getopt(mp->mnt_optnew, "timeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.timeo); if (ret != 1 || args.timeo <= 0) { vfs_mount_error(mp, "illegal timeo: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_TIMEO; } if (vfs_getopt(mp->mnt_optnew, "timeout", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.timeo); if (ret != 1 || args.timeo <= 0) { vfs_mount_error(mp, "illegal timeout: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_TIMEO; } if (vfs_getopt(mp->mnt_optnew, "nametimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &nametimeo); if (ret != 1 || nametimeo < 0) { vfs_mount_error(mp, "illegal nametimeo: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "negnametimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &negnametimeo); if (ret != 1 || negnametimeo < 0) { vfs_mount_error(mp, "illegal negnametimeo: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "minorversion", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &minvers); if (ret != 1 || minvers < 0 || minvers > 1 || (args.flags & NFSMNT_NFSV4) == 0) { vfs_mount_error(mp, "illegal minorversion: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "sec", (void **) &secname, NULL) == 0) nfs_sec_name(secname, &args.flags); if (mp->mnt_flag & MNT_UPDATE) { struct nfsmount *nmp = VFSTONFS(mp); if (nmp == NULL) { error = EIO; goto out; } /* * If a 
change from TCP->UDP is done and there are thread(s) * that have I/O RPC(s) in progress with a transfer size * greater than NFS_MAXDGRAMDATA, those thread(s) will be * hung, retrying the RPC(s) forever. Usually these threads * will be seen doing an uninterruptible sleep on wait channel * "nfsreq". */ if (args.sotype == SOCK_DGRAM && nmp->nm_sotype == SOCK_STREAM) tprintf(td->td_proc, LOG_WARNING, "Warning: mount -u that changes TCP->UDP can result in hung threads\n"); /* * When doing an update, we can't change version, * security, switch lockd strategies or change cookie * translation */ args.flags = (args.flags & ~(NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)) | (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)); nfs_decode_args(mp, nmp, &args, NULL, td->td_ucred, td); goto out; } /* * Make the nfs_ip_paranoia sysctl serve as the default connection * or no-connection mode for those protocols that support * no-connection mode (the flag will be cleared later for protocols * that do not support no-connection mode). This will allow a client * to receive replies from a different IP than the request was * sent to. Note: default value for nfs_ip_paranoia is 1 (paranoid), * not 0. */ if (nfs_ip_paranoia == 0) args.flags |= NFSMNT_NOCONN; if (has_nfs_args_opt != 0) { /* * In the 'nfs_args' case, the pointers in the args * structure are in userland - we copy them in here. */ if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) { vfs_mount_error(mp, "Bad file handle"); error = EINVAL; goto out; } error = copyin((caddr_t)args.fh, (caddr_t)nfh, args.fhsize); if (error != 0) goto out; error = copyinstr(args.hostname, hst, MNAMELEN - 1, &hstlen); if (error != 0) goto out; bzero(&hst[hstlen], MNAMELEN - hstlen); args.hostname = hst; /* getsockaddr() call must be after above copyin() calls */ error = getsockaddr(&nam, (caddr_t)args.addr, args.addrlen); if (error != 0) goto out; } else if (nfs_mount_parse_from(mp->mnt_optnew, &args.hostname, (struct sockaddr_in **)&nam, dirpath, sizeof(dirpath), &dirlen) == 0) { has_nfs_from_opt = 1; bcopy(args.hostname, hst, MNAMELEN); hst[MNAMELEN - 1] = '\0'; /* * This only works with NFSv4 for now.
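	 * (NFSv4 can look up the root file handle itself, which is why
	 * args.fhsize is left at 0 below and the mount is forced to
	 * NFSv4 over TCP.)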
*/ args.fhsize = 0; args.flags |= NFSMNT_NFSV4; args.sotype = SOCK_STREAM; } else { if (vfs_getopt(mp->mnt_optnew, "fh", (void **)&args.fh, &args.fhsize) == 0) { if (args.fhsize < 0 || args.fhsize > NFSX_FHMAX) { vfs_mount_error(mp, "Bad file handle"); error = EINVAL; goto out; } bcopy(args.fh, nfh, args.fhsize); } else { args.fhsize = 0; } (void) vfs_getopt(mp->mnt_optnew, "hostname", (void **)&args.hostname, &len); if (args.hostname == NULL) { vfs_mount_error(mp, "Invalid hostname"); error = EINVAL; goto out; } bcopy(args.hostname, hst, MNAMELEN); hst[MNAMELEN - 1] = '\0'; } if (vfs_getopt(mp->mnt_optnew, "principal", (void **)&name, NULL) == 0) strlcpy(srvkrbname, name, sizeof (srvkrbname)); else { snprintf(srvkrbname, sizeof (srvkrbname), "nfs@%s", hst); cp = strchr(srvkrbname, ':'); if (cp != NULL) *cp = '\0'; } srvkrbnamelen = strlen(srvkrbname); if (vfs_getopt(mp->mnt_optnew, "gssname", (void **)&name, NULL) == 0) strlcpy(krbname, name, sizeof (krbname)); else krbname[0] = '\0'; krbnamelen = strlen(krbname); if (has_nfs_from_opt == 0) { if (vfs_getopt(mp->mnt_optnew, "dirpath", (void **)&name, NULL) == 0) strlcpy(dirpath, name, sizeof (dirpath)); else dirpath[0] = '\0'; dirlen = strlen(dirpath); } if (has_nfs_args_opt == 0 && has_nfs_from_opt == 0) { if (vfs_getopt(mp->mnt_optnew, "addr", (void **)&args.addr, &args.addrlen) == 0) { if (args.addrlen > SOCK_MAXADDRLEN) { error = ENAMETOOLONG; goto out; } nam = malloc(args.addrlen, M_SONAME, M_WAITOK); bcopy(args.addr, nam, args.addrlen); nam->sa_len = args.addrlen; } else { vfs_mount_error(mp, "No server address"); error = EINVAL; goto out; } } args.fh = nfh; error = mountnfs(&args, mp, nam, hst, krbname, krbnamelen, dirpath, dirlen, srvkrbname, srvkrbnamelen, &vp, td->td_ucred, td, nametimeo, negnametimeo, minvers); out: if (!error) { MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_NO_IOPF | MNTK_USES_BCACHE; if ((VFSTONFS(mp)->nm_flag & NFSMNT_NFSV4) != 0) mp->mnt_kern_flag |= MNTK_NULL_NOCACHE; MNT_IUNLOCK(mp); } return (error); } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the getsockaddr() call because getsockaddr() allocates an mbuf and * an error after that means that I have to release the mbuf. 
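 * (Editor's note: nfs_cmount() below is the legacy mount(2) path; it
 * copies the userland struct nfs_args and re-enters kernel_mount() with
 * the structure packed as an "nfs_args" option, which the
 * has_nfs_args_opt handling above then consumes.)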
*/ /* ARGSUSED */ static int nfs_cmount(struct mntarg *ma, void *data, uint64_t flags) { int error; struct nfs_args args; error = copyin(data, &args, sizeof (struct nfs_args)); if (error) return error; ma = mount_arg(ma, "nfs_args", &args, sizeof args); error = kernel_mount(ma, flags); return (error); } /* * Common code for mount and mountroot */ static int mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, char *hst, u_char *krbname, int krbnamelen, u_char *dirpath, int dirlen, u_char *srvkrbname, int srvkrbnamelen, struct vnode **vpp, struct ucred *cred, struct thread *td, int nametimeo, int negnametimeo, int minvers) { struct nfsmount *nmp; struct nfsnode *np; int error, trycnt, ret; struct nfsvattr nfsva; struct nfsclclient *clp; struct nfsclds *dsp, *tdsp; uint32_t lease; static u_int64_t clval = 0; NFSCL_DEBUG(3, "in mnt\n"); clp = NULL; if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); printf("%s: MNT_UPDATE is no longer handled here\n", __func__); FREE(nam, M_SONAME); return (0); } else { MALLOC(nmp, struct nfsmount *, sizeof (struct nfsmount) + krbnamelen + dirlen + srvkrbnamelen + 2, M_NEWNFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&nmp->nm_bufq); TAILQ_INIT(&nmp->nm_sess); if (clval == 0) clval = (u_int64_t)nfsboottime.tv_sec; nmp->nm_clval = clval++; nmp->nm_krbnamelen = krbnamelen; nmp->nm_dirpathlen = dirlen; nmp->nm_srvkrbnamelen = srvkrbnamelen; if (td->td_ucred->cr_uid != (uid_t)0) { /* * nm_uid is used to get KerberosV credentials for * the nfsv4 state handling operations if there is * no host based principal set. Use the uid of * this user if not root, since they are doing the * mount. I don't think setting this for root will * work, since root normally does not have user * credentials in a credentials cache. */ nmp->nm_uid = td->td_ucred->cr_uid; } else { /* * Just set to -1, so it won't be used. */ nmp->nm_uid = (uid_t)-1; } /* Copy and null terminate all the names */ if (nmp->nm_krbnamelen > 0) { bcopy(krbname, nmp->nm_krbname, nmp->nm_krbnamelen); nmp->nm_name[nmp->nm_krbnamelen] = '\0'; } if (nmp->nm_dirpathlen > 0) { bcopy(dirpath, NFSMNT_DIRPATH(nmp), nmp->nm_dirpathlen); nmp->nm_name[nmp->nm_krbnamelen + nmp->nm_dirpathlen + 1] = '\0'; } if (nmp->nm_srvkrbnamelen > 0) { bcopy(srvkrbname, NFSMNT_SRVKRBNAME(nmp), nmp->nm_srvkrbnamelen); nmp->nm_name[nmp->nm_krbnamelen + nmp->nm_dirpathlen + nmp->nm_srvkrbnamelen + 2] = '\0'; } nmp->nm_sockreq.nr_cred = crhold(cred); mtx_init(&nmp->nm_sockreq.nr_mtx, "nfssock", NULL, MTX_DEF); mp->mnt_data = nmp; nmp->nm_getinfo = nfs_getnlminfo; nmp->nm_vinvalbuf = ncl_vinvalbuf; } vfs_getnewfsid(mp); nmp->nm_mountp = mp; mtx_init(&nmp->nm_mtx, "NFSmount lock", NULL, MTX_DEF | MTX_DUPOK); /* * Since nfs_decode_args() might optionally set them, these * need to be set to defaults before the call, so that the * optional settings aren't overwritten. */ nmp->nm_nametimeo = nametimeo; nmp->nm_negnametimeo = negnametimeo; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; nmp->nm_readahead = NFS_DEFRAHEAD; /* This is empirical approximation of sqrt(hibufspace) * 256. */ nmp->nm_wcommitsize = NFS_MAXBSIZE / 256; while ((long)nmp->nm_wcommitsize * nmp->nm_wcommitsize < hibufspace) nmp->nm_wcommitsize *= 2; nmp->nm_wcommitsize *= 256; if ((argp->flags & NFSMNT_NFSV4) != 0) nmp->nm_minorvers = minvers; else nmp->nm_minorvers = 0; nfs_decode_args(mp, nmp, argp, hst, cred, td); /* * V2 can only handle 32 bit filesizes. 
A 4GB-1 limit may be too * high, depending on whether we end up with negative offsets in * the client or server somewhere. 2GB-1 may be safer. * * For V3, ncl_fsinfo will adjust this as necessary. Assume maximum * that we can handle until we find out otherwise. */ if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) nmp->nm_maxfilesize = 0xffffffffLL; else nmp->nm_maxfilesize = OFF_MAX; if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) { nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; nmp->nm_readdirsize = NFS_READDIRSIZE; } nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_tprintf_delay = nfs_tprintf_delay; if (nmp->nm_tprintf_delay < 0) nmp->nm_tprintf_delay = 0; nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; nmp->nm_fhsize = argp->fhsize; if (nmp->nm_fhsize > 0) bcopy((caddr_t)argp->fh, (caddr_t)nmp->nm_fh, argp->fhsize); bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); nmp->nm_nam = nam; /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; nmp->nm_sockreq.nr_prog = NFS_PROG; if ((argp->flags & NFSMNT_NFSV4)) nmp->nm_sockreq.nr_vers = NFS_VER4; else if ((argp->flags & NFSMNT_NFSV3)) nmp->nm_sockreq.nr_vers = NFS_VER3; else nmp->nm_sockreq.nr_vers = NFS_VER2; if ((error = newnfs_connect(nmp, &nmp->nm_sockreq, cred, td, 0))) goto bad; /* For NFSv4.1, get the clientid now. */ if (nmp->nm_minorvers > 0) { NFSCL_DEBUG(3, "at getcl\n"); error = nfscl_getcl(mp, cred, td, 0, &clp); NFSCL_DEBUG(3, "aft getcl=%d\n", error); if (error != 0) goto bad; } if (nmp->nm_fhsize == 0 && (nmp->nm_flag & NFSMNT_NFSV4) && nmp->nm_dirpathlen > 0) { NFSCL_DEBUG(3, "in dirp\n"); /* * If the fhsize on the mount point == 0 for V4, the mount * path needs to be looked up. */ trycnt = 3; do { error = nfsrpc_getdirpath(nmp, NFSMNT_DIRPATH(nmp), cred, td); NFSCL_DEBUG(3, "aft dirp=%d\n", error); if (error) (void) nfs_catnap(PZERO, error, "nfsgetdirp"); } while (error && --trycnt > 0); if (error) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); goto bad; } } /* * A reference count is needed on the nfsnode representing the * remote root. If this object is not persistent, then backward * traversals of the mount point (i.e. "..") will not work if * the nfsnode gets flushed out of the cache. Ufs does not have * this problem, because one can identify root inodes by their - * number == ROOTINO (2). + * number == UFS_ROOTINO (2). */ if (nmp->nm_fhsize > 0) { /* * Set f_iosize to NFS_DIRBLKSIZ so that bo_bsize gets set * non-zero for the root vnode. f_iosize will be set correctly * by nfs_statfs() before any I/O occurs. */ mp->mnt_stat.f_iosize = NFS_DIRBLKSIZ; error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) goto bad; *vpp = NFSTOV(np); /* * Get file attributes and transfer parameters for the * mountpoint. This has the side effect of filling in * (*vpp)->v_type with the correct value. */ ret = nfsrpc_getattrnovp(nmp, nmp->nm_fh, nmp->nm_fhsize, 1, cred, td, &nfsva, NULL, &lease); if (ret) { /* * Just set default values to get things going. 
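 * (Editor's note: the fallback below simply fabricates a plausible
 * 512KB, mode-0777 directory so the mount can complete; genuine
 * attributes replace these defaults once a later getattr succeeds.)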
*/ NFSBZERO((caddr_t)&nfsva, sizeof (struct nfsvattr)); nfsva.na_vattr.va_type = VDIR; nfsva.na_vattr.va_mode = 0777; nfsva.na_vattr.va_nlink = 100; nfsva.na_vattr.va_uid = (uid_t)0; nfsva.na_vattr.va_gid = (gid_t)0; nfsva.na_vattr.va_fileid = 2; nfsva.na_vattr.va_gen = 1; nfsva.na_vattr.va_blocksize = NFS_FABLKSIZE; nfsva.na_vattr.va_size = 512 * 1024; lease = 60; } (void) nfscl_loadattrcache(vpp, &nfsva, NULL, NULL, 0, 1); if (nmp->nm_minorvers > 0) { NFSCL_DEBUG(3, "lease=%d\n", (int)lease); NFSLOCKCLSTATE(); clp->nfsc_renew = NFSCL_RENEW(lease); clp->nfsc_expire = NFSD_MONOSEC + clp->nfsc_renew; clp->nfsc_clientidrev++; if (clp->nfsc_clientidrev == 0) clp->nfsc_clientidrev++; NFSUNLOCKCLSTATE(); /* * Mount will succeed, so the renew thread can be * started now. */ nfscl_start_renewthread(clp); nfscl_clientrelease(clp); } if (argp->flags & NFSMNT_NFSV3) ncl_fsinfo(nmp, *vpp, cred, td); /* Mark if the mount point supports NFSv4 ACLs. */ if ((argp->flags & NFSMNT_NFSV4) != 0 && nfsrv_useacl != 0 && ret == 0 && NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL)) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * Lose the lock but keep the ref. */ NFSVOPUNLOCK(*vpp, 0); return (0); } error = EIO; bad: if (clp != NULL) nfscl_clientrelease(clp); newnfs_disconnect(&nmp->nm_sockreq); crfree(nmp->nm_sockreq.nr_cred); if (nmp->nm_sockreq.nr_auth != NULL) AUTH_DESTROY(nmp->nm_sockreq.nr_auth); mtx_destroy(&nmp->nm_sockreq.nr_mtx); mtx_destroy(&nmp->nm_mtx); if (nmp->nm_clp != NULL) { NFSLOCKCLSTATE(); LIST_REMOVE(nmp->nm_clp, nfsc_list); NFSUNLOCKCLSTATE(); free(nmp->nm_clp, M_NFSCLCLIENT); } TAILQ_FOREACH_SAFE(dsp, &nmp->nm_sess, nfsclds_list, tdsp) nfscl_freenfsclds(dsp); FREE(nmp, M_NEWNFSMNT); FREE(nam, M_SONAME); return (error); } /* * unmount system call */ static int nfs_unmount(struct mount *mp, int mntflags) { struct thread *td; struct nfsmount *nmp; int error, flags = 0, i, trycnt = 0; struct nfsclds *dsp, *tdsp; td = curthread; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; nmp = VFSTONFS(mp); /* * Goes something like this.. * - Call vflush() to clear out vnodes for this filesystem * - Close the socket * - Free up the data structures */ /* In the forced case, cancel any outstanding requests. */ if (mntflags & MNT_FORCE) { error = newnfs_nmcancelreqs(nmp); if (error) goto out; /* For a forced close, get rid of the renew thread now */ nfscl_umount(nmp, td); } /* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */ do { error = vflush(mp, 1, flags, td); if ((mntflags & MNT_FORCE) && error != 0 && ++trycnt < 30) (void) nfs_catnap(PSOCK, error, "newndm"); } while ((mntflags & MNT_FORCE) && error != 0 && trycnt < 30); if (error) goto out; /* * We are now committed to the unmount. */ if ((mntflags & MNT_FORCE) == 0) nfscl_umount(nmp, td); /* Make sure no nfsiods are assigned to this mount. 
*/ mtx_lock(&ncl_iod_mutex); for (i = 0; i < NFS_MAXASYNCDAEMON; i++) if (ncl_iodmount[i] == nmp) { ncl_iodwant[i] = NFSIOD_AVAILABLE; ncl_iodmount[i] = NULL; } mtx_unlock(&ncl_iod_mutex); newnfs_disconnect(&nmp->nm_sockreq); crfree(nmp->nm_sockreq.nr_cred); FREE(nmp->nm_nam, M_SONAME); if (nmp->nm_sockreq.nr_auth != NULL) AUTH_DESTROY(nmp->nm_sockreq.nr_auth); mtx_destroy(&nmp->nm_sockreq.nr_mtx); mtx_destroy(&nmp->nm_mtx); TAILQ_FOREACH_SAFE(dsp, &nmp->nm_sess, nfsclds_list, tdsp) nfscl_freenfsclds(dsp); FREE(nmp, M_NEWNFSMNT); out: return (error); } /* * Return root of a filesystem */ static int nfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; struct nfsmount *nmp; struct nfsnode *np; int error; nmp = VFSTONFS(mp); error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, flags); if (error) return error; vp = NFSTOV(np); /* * Get transfer parameters and attributes for root vnode once. */ mtx_lock(&nmp->nm_mtx); if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) { mtx_unlock(&nmp->nm_mtx); ncl_fsinfo(nmp, vp, curthread->td_ucred, curthread); } else mtx_unlock(&nmp->nm_mtx); if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Flush out the buffer cache */ /* ARGSUSED */ static int nfs_sync(struct mount *mp, int waitfor) { struct vnode *vp, *mvp; struct thread *td; int error, allerror = 0; td = curthread; MNT_ILOCK(mp); /* * If a forced dismount is in progress, return from here so that * the umount(2) syscall doesn't get stuck in VFS_SYNC() before * calling VFS_UNMOUNT(). */ if ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0) { MNT_IUNLOCK(mp); return (EBADF); } MNT_IUNLOCK(mp); /* * Force stale buffer cache information to be flushed. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* XXX Racy bv_cnt check. */ if (NFSVOPISLOCKED(vp) || vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY) { VI_UNLOCK(vp); continue; } if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; NFSVOPUNLOCK(vp, 0); vrele(vp); } return (allerror); } static int nfs_sysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req) { struct nfsmount *nmp = VFSTONFS(mp); struct vfsquery vq; int error; bzero(&vq, sizeof(vq)); switch (op) { #if 0 case VFS_CTL_NOLOCKS: val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 
1 : 0; if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &val, sizeof(val)); if (error) return (error); } if (req->newptr != NULL) { error = SYSCTL_IN(req, &val, sizeof(val)); if (error) return (error); if (val) nmp->nm_flag |= NFSMNT_NOLOCKS; else nmp->nm_flag &= ~NFSMNT_NOLOCKS; } break; #endif case VFS_CTL_QUERY: mtx_lock(&nmp->nm_mtx); if (nmp->nm_state & NFSSTA_TIMEO) vq.vq_flags |= VQ_NOTRESP; mtx_unlock(&nmp->nm_mtx); #if 0 if (!(nmp->nm_flag & NFSMNT_NOLOCKS) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) vq.vq_flags |= VQ_NOTRESPLOCK; #endif error = SYSCTL_OUT(req, &vq, sizeof(vq)); break; case VFS_CTL_TIMEO: if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); } if (req->newptr != NULL) { error = vfs_suser(mp, req->td); if (error) return (error); error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; } break; default: return (ENOTSUP); } return (0); } /* * Purge any RPCs in progress, so that they will all return errors. * This allows dounmount() to continue as far as VFS_UNMOUNT() for a * forced dismount. */ static void nfs_purge(struct mount *mp) { struct nfsmount *nmp = VFSTONFS(mp); newnfs_nmcancelreqs(nmp); } /* * Extract the information needed by the nlm from the nfs vnode. */ static void nfs_getnlminfo(struct vnode *vp, uint8_t *fhp, size_t *fhlenp, struct sockaddr_storage *sp, int *is_v3p, off_t *sizep, struct timeval *timeop) { struct nfsmount *nmp; struct nfsnode *np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); if (fhlenp != NULL) *fhlenp = (size_t)np->n_fhp->nfh_len; if (fhp != NULL) bcopy(np->n_fhp->nfh_fh, fhp, np->n_fhp->nfh_len); if (sp != NULL) bcopy(nmp->nm_nam, sp, min(nmp->nm_nam->sa_len, sizeof(*sp))); if (is_v3p != NULL) *is_v3p = NFS_ISV3(vp); if (sizep != NULL) *sizep = np->n_size; if (timeop != NULL) { timeop->tv_sec = nmp->nm_timeo / NFS_HZ; timeop->tv_usec = (nmp->nm_timeo % NFS_HZ) * (1000000 / NFS_HZ); } } /* * This function prints out an option name, based on the conditional * argument. */ static __inline void nfscl_printopt(struct nfsmount *nmp, int testval, char *opt, char **buf, size_t *blen) { int len; if (testval != 0 && *blen > strlen(opt)) { len = snprintf(*buf, *blen, "%s", opt); if (len != strlen(opt)) printf("EEK!!\n"); *buf += len; *blen -= len; } } /* * This function prints out an option's integer value. */ static __inline void nfscl_printoptval(struct nfsmount *nmp, int optval, char *opt, char **buf, size_t *blen) { int len; if (*blen > strlen(opt) + 1) { /* Could result in truncated output string. */ len = snprintf(*buf, *blen, "%s=%d", opt, optval); if (len < *blen) { *buf += len; *blen -= len; } } } /* * Load the option flags and values into the buffer.
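 * For illustration, a caller might use it like this (editor's sketch;
 * the buffer size and rendered options are hypothetical -- a plain
 * NFSv3-over-TCP mount would come out roughly as
 * "nfsv3,tcp,resvport,hard,cto,lockd,sec=sys,..."):
 */
#if 0
	char opts[256];

	nfscl_retopts(nmp, opts, sizeof(opts));
#endif
/*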
*/ void nfscl_retopts(struct nfsmount *nmp, char *buffer, size_t buflen) { char *buf; size_t blen; buf = buffer; blen = buflen; nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NFSV4) != 0, "nfsv4", &buf, &blen); if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) { nfscl_printoptval(nmp, nmp->nm_minorvers, ",minorversion", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_PNFS) != 0, ",pnfs", &buf, &blen); } nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NFSV3) != 0, "nfsv3", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0, "nfsv2", &buf, &blen); nfscl_printopt(nmp, nmp->nm_sotype == SOCK_STREAM, ",tcp", &buf, &blen); nfscl_printopt(nmp, nmp->nm_sotype != SOCK_STREAM, ",udp", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_RESVPORT) != 0, ",resvport", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCONN) != 0, ",noconn", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_SOFT) == 0, ",hard", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_SOFT) != 0, ",soft", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_INT) != 0, ",intr", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCTO) == 0, ",cto", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCTO) != 0, ",nocto", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0, ",noncontigwr", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NOLOCKD | NFSMNT_NFSV4)) == 0, ",lockd", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NOLOCKD | NFSMNT_NFSV4)) == NFSMNT_NOLOCKD, ",nolockd", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_RDIRPLUS) != 0, ",rdirplus", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_KERB) == 0, ",sec=sys", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == NFSMNT_KERB, ",sec=krb5", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == (NFSMNT_KERB | NFSMNT_INTEGRITY), ",sec=krb5i", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == (NFSMNT_KERB | NFSMNT_PRIVACY), ",sec=krb5p", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acdirmin, ",acdirmin", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acdirmax, ",acdirmax", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acregmin, ",acregmin", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acregmax, ",acregmax", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_nametimeo, ",nametimeo", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_negnametimeo, ",negnametimeo", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_rsize, ",rsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_wsize, ",wsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_readdirsize, ",readdirsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_readahead, ",readahead", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_wcommitsize, ",wcommitsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_timeo, ",timeout", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_retry, ",retrans", &buf, &blen); } Index: head/sys/ufs/ffs/ffs_alloc.c =================================================================== --- head/sys/ufs/ffs/ffs_alloc.c (revision 313779) +++ head/sys/ufs/ffs/ffs_alloc.c (revision 313780) @@ -1,3223 +1,3223 @@ /*- * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. 
* * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref, int size, int rsize); static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); static void ffs_blkfree_cg(struct ufsmount *, struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, struct workhead *); static void ffs_blkfree_trim_completed(struct bio *); static void ffs_blkfree_trim_task(void *ctx, int pending __unused); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_hashalloc (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *); static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); /* * Allocate a block in the filesystem. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate a block in the same cylinder group. * 4) quadratically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following hierarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadratically rehash into other cylinder groups, until an * available block is located.
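 *
 * A rough sketch of that probe order (editor's illustration; the real
 * walk lives in ffs_hashalloc(), which is outside this hunk, so the
 * stride below is an assumption):
 */
#if 0
	/* start at cg, then widen the stride quadratically, wrapping at ncg */
	for (stride = 1, c = cg; stride < fs->fs_ncg; stride *= 2) {
		c += stride;
		if (c >= fs->fs_ncg)
			c -= fs->fs_ncg;
		/* ...try to allocate in cylinder group c... */
	}
#endif
/*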
*/ int ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp) struct inode *ip; ufs2_daddr_t lbn, bpref; int size, flags; struct ucred *cred; ufs2_daddr_t *bnp; { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t bno; u_int cg, reclaimed; static struct timeval lastfail; static int curfail; int64_t delta; #ifdef QUOTA int error; #endif *bnp = 0; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_alloc: bad size"); } if (cred == NOCRED) panic("ffs_alloc: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: #ifdef QUOTA UFS_UNLOCK(ump); error = chkdq(ip, btodb(size), cred, 0); if (error) return (error); UFS_LOCK(ump); #endif if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) goto nospace; if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; *bnp = bno; return (0); } nospace: #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(size), cred, FORCE); UFS_LOCK(ump); #endif if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); goto retry; } UFS_UNLOCK(ump); if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } return (ENOSPC); } /* * Reallocate a fragment to a bigger size * * The number and size of the old block are given, and a preference * and new size are also specified. The allocator attempts to extend * the original block. Failing that, the regular block allocator is * invoked to get an appropriate block. */ int ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) struct inode *ip; ufs2_daddr_t lbprev; ufs2_daddr_t bprev; ufs2_daddr_t bpref; int osize, nsize, flags; struct ucred *cred; struct buf **bpp; { struct vnode *vp; struct fs *fs; struct buf *bp; struct ufsmount *ump; u_int cg, request, reclaimed; int error, gbflags; ufs2_daddr_t bno; static struct timeval lastfail; static int curfail; int64_t delta; vp = ITOV(ip); ump = ITOUMP(ip); fs = ump->um_fs; bp = NULL; gbflags = (flags & BA_UNMAPPED) != 0 ?
GB_UNMAPPED : 0; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_realloccg: allocation on suspended filesystem"); if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, osize, nsize, fs->fs_fsmnt); panic("ffs_realloccg: bad size"); } if (cred == NOCRED) panic("ffs_realloccg: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { goto nospace; } if (bprev == 0) { printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, fs->fs_fsmnt); panic("ffs_realloccg: bad bprev"); } UFS_UNLOCK(ump); /* * Allocate the extra space in the buffer. */ error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } if (bp->b_blkno == bp->b_lblkno) { - if (lbprev >= NDADDR) + if (lbprev >= UFS_NDADDR) panic("ffs_realloccg: lbprev out of range"); bp->b_blkno = fsbtodb(fs, bprev); } #ifdef QUOTA error = chkdq(ip, btodb(nsize - osize), cred, 0); if (error) { brelse(bp); return (error); } #endif /* * Check for extension in the existing location. */ *bpp = NULL; cg = dtog(fs, bprev); UFS_LOCK(ump); bno = ffs_fragextend(ip, cg, bprev, osize, nsize); if (bno) { if (bp->b_blkno != fsbtodb(fs, bno)) panic("ffs_realloccg: bad blockno"); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } /* * Allocate a new disk location. */ if (bpref >= fs->fs_size) bpref = 0; switch ((int)fs->fs_optim) { case FS_OPTSPACE: /* * Allocate an exact sized fragment. Although this makes * best use of space, we will waste time relocating it if * the file continues to grow. If the fragmentation is * less than half of the minimum free reserve, we choose * to begin optimizing for time. */ request = nsize; if (fs->fs_minfree <= 5 || fs->fs_cstotal.cs_nffree > (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) break; log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTTIME; break; case FS_OPTTIME: /* * At this point we have discovered a file that is trying to * grow a small fragment to a larger fragment. To save time, * we allocate a full sized block, then free the unused portion. * If the file continues to grow, the `ffs_fragextend' call * above will be able to grow it in place without further * copying. If aberrant programs cause disk fragmentation to * grow within 2% of the free reserve, we choose to begin * optimizing for space. 
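 * For example (editor's note; 8% is the long-standing default for
 * fs_minfree): the test below switches back to space optimization once
 * free fragments climb past (8 - 2)% = 6% of fs_dsize.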
*/ request = fs->fs_bsize; if (fs->fs_cstotal.cs_nffree < (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) break; log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTSPACE; break; default: printf("dev = %s, optim = %ld, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, ip->i_number, vp->v_type, NULL); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) ip->i_flag |= IN_CHANGE; else ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); UFS_LOCK(ump); #endif nospace: /* * no space available */ if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; UFS_UNLOCK(ump); if (bp) { brelse(bp); bp = NULL; } UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); goto retry; } UFS_UNLOCK(ump); if (bp) brelse(bp); if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } return (ENOSPC); } /* * Reallocate a sequence of blocks into a contiguous sequence of blocks. * * The vnode and an array of buffer pointers for a range of sequential * logical blocks to be made contiguous are given. The allocator attempts * to find a range of sequential blocks starting as close as possible * from the end of the allocation for the logical block immediately * preceding the current range. If successful, the physical block numbers * in the buffer pointers and in the inode are changed to reflect the new * allocation. If unsuccessful, the allocation is left unchanged. The * success in doing the reallocation is returned. Note that the error * return is not reflected back to the user. Rather the previous block * allocation will be used. */ SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); static int doasyncfree = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, "do not force synchronous writes when blocks are reallocated"); static int doreallocblks = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "enable block reallocation"); static int maxclustersearch = 10; SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 0, "max number of cylinder groups to search for contiguous blocks"); #ifdef DEBUG static volatile int prtrealloc = 0; #endif int ffs_reallocblks(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct ufsmount *ump; /* * If the underlying device can do deletes, then skip reallocating * the blocks of this file into contiguous sequences. Devices that * benefit from BIO_DELETE also benefit from not moving the data. * These devices are flash and therefore work less well with this * optimization. Also skip if reallocblks has been disabled globally.
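 * (Editor's note: the global switch is the vfs.ffs.doreallocblks sysctl
 * declared above; "sysctl vfs.ffs.doreallocblks=0" disables block
 * reallocation filesystem-wide.)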
*/ ump = ap->a_vp->v_mount->mnt_data; if (ump->um_candelete || doreallocblks == 0) return (ENOSPC); /* * We can't wait in softdep prealloc as it may fsync and recurse * here. Instead we simply fail to reallocate blocks if this * rare condition arises. */ if (DOINGSOFTDEP(ap->a_vp)) if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) return (ENOSPC); if (ump->um_fstype == UFS1) return (ffs_reallocblks_ufs1(ap)); return (ffs_reallocblks_ufs2(ap)); } static int ffs_reallocblks_ufs1(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp; ufs1_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs1_daddr_t soff, newblk, blkno; ufs2_daddr_t pref; - struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; + struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, leave space for the indirect block. Indirect blocks * are initially laid out in a position after the last direct * block. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ - if (start_lbn < NDADDR && end_lbn >= NDADDR) + if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din1->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs1_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. 
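 * (Editor's note: when the range does span two maps, the loop further
 * down updates the first ssize block pointers through sbap/sbp and the
 * remaining len - ssize pointers through ebap/ebp.)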
*/ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs1_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search criterion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DEBUG if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DEBUG if (prtrealloc) printf(" %d,", *bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din1->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practice * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din1->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */
*/ #ifdef DEBUG if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG if (prtrealloc) printf(" %d,", blkno); #endif } #ifdef DEBUG if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din1->di_db[0]) brelse(sbp); return (ENOSPC); } static int ffs_reallocblks_ufs2(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp; ufs2_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs2_daddr_t soff, newblk, blkno, pref; - struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; + struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, do not move anything in it. Indirect blocks are * usually initially laid out in a position between the data * blocks. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ - if (start_lbn < NDADDR && end_lbn >= NDADDR) + if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. 
*/ if (start_lvl == 0) { sbap = &ip->i_din2->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs2_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs2_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search criterion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DEBUG if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DEBUG if (prtrealloc) printf(" %jd,", (intmax_t)*bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din2->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practice * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared.
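 * (Editor's note: the flag is exported as the vfs.ffs.doasyncfree sysctl
 * declared earlier, so "sysctl vfs.ffs.doasyncfree=0" selects the
 * synchronous behavior described here.)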
*/ if (sbap != &ip->i_din2->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DEBUG if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { if (!DOINGSOFTDEP(vp)) ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG if (prtrealloc) printf(" %jd,", (intmax_t)blkno); #endif } #ifdef DEBUG if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din2->di_db[0]) brelse(sbp); return (ENOSPC); } /* * Allocate an inode in the filesystem. * * If allocating a directory, use ffs_dirpref to select the inode. * If allocating in a directory, the following hierarchy is followed: * 1) allocate the preferred inode. * 2) allocate an inode in the same cylinder group. * 3) quadratically rehash into other cylinder groups, until an * available inode is located. * If no inode preference is given the following hierarchy is used * to allocate an inode: * 1) allocate an inode in cylinder group 0. * 2) quadratically rehash into other cylinder groups, until an * available inode is located. */ int ffs_valloc(pvp, mode, cred, vpp) struct vnode *pvp; int mode; struct ucred *cred; struct vnode **vpp; { struct inode *pip; struct fs *fs; struct inode *ip; struct timespec ts; struct ufsmount *ump; ino_t ino, ipref; u_int cg; int error, error1, reclaimed; static struct timeval lastfail; static int curfail; *vpp = NULL; pip = VTOI(pvp); ump = ITOUMP(pip); fs = ump->um_fs; UFS_LOCK(ump); reclaimed = 0; retry: if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; if ((mode & IFMT) == IFDIR) ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = ino_to_cg(fs, ipref); /* * Track the number of dirs created one after another * in the same cg without intervening files. */ if ((mode & IFMT) == IFDIR) { if (fs->fs_contigdirs[cg] < 255) fs->fs_contigdirs[cg]++; } else { if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, FFSV_FORCEINSMQ); ffs_vfree(pvp, ino, mode); if (error1 == 0) { ip = VTOI(*vpp); if (ip->i_mode) goto dup_alloc; ip->i_flag |= IN_MODIFIED; vput(*vpp); } return (error); } ip = VTOI(*vpp); if (ip->i_mode) { dup_alloc: printf("mode = 0%o, inum = %ju, fs = %s\n", ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); panic("ffs_valloc: dup alloc"); } if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ printf("free inode %s/%lu had %ld blocks\n", fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks)); DIP_SET(ip, i_blocks, 0); } ip->i_flags = 0; DIP_SET(ip, i_flags, 0); /* * Set up a new generation number for this inode.
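 * (Editor's note: the generation number is embedded in NFS file handles
 * so a handle goes stale once its inode is reused; zero is skipped so a
 * valid inode never carries the never-used generation.)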
*/ while (ip->i_gen == 0 || ++ip->i_gen == 0) ip->i_gen = arc4random(); DIP_SET(ip, i_gen, ip->i_gen); if (fs->fs_magic == FS_UFS2_MAGIC) { vfs_timestamp(&ts); ip->i_din2->di_birthtime = ts.tv_sec; ip->i_din2->di_birthnsec = ts.tv_nsec; } ufs_prepare_reclaim(*vpp); ip->i_flag = 0; (*vpp)->v_vflag = 0; (*vpp)->v_type = VNON; if (fs->fs_magic == FS_UFS2_MAGIC) { (*vpp)->v_op = &ffs_vnodeops2; ip->i_flag |= IN_UFS2; } else { (*vpp)->v_op = &ffs_vnodeops1; } return (0); noinodes: if (reclaimed == 0) { reclaimed = 1; softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); goto retry; } UFS_UNLOCK(ump); if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, pip->i_number, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); } return (ENOSPC); } /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files' inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ static ino_t ffs_dirpref(pip) struct inode *pip; { struct fs *fs; int cg, prefcg, dirsize, cgsize; u_int avgifree, avgbfree, avgndir, curdirsize; u_int minifree, minbfree, maxndir; u_int mincg, minndir; u_int maxcontigdirs; mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); fs = ITOFS(pip); avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; /* * Force allocation in another cg if creating a first level dir. */ ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref"); if (ITOV(pip)->v_vflag & VV_ROOT) { prefcg = arc4random() % fs->fs_ncg; mincg = prefcg; minndir = fs->fs_ipg; for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } return ((ino_t)(fs->fs_ipg * mincg)); } /* * Count various limits which are used for * optimal allocation of a directory inode. */ maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); minifree = avgifree - avgifree / 4; if (minifree < 1) minifree = 1; minbfree = avgbfree - avgbfree / 4; if (minbfree < 1) minbfree = 1; cgsize = fs->fs_fsize * fs->fs_fpg; dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; if (dirsize < curdirsize) dirsize = curdirsize; if (dirsize <= 0) maxcontigdirs = 0; /* dirsize overflowed */ else maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); if (fs->fs_avgfpdir > 0) maxcontigdirs = min(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. * * We are trying to find a suitable cylinder group nearby * our preferred cylinder group to place a new directory. * We scan from our preferred cylinder group forward looking * for a cylinder group that meets our criterion.
If we get * to the final cylinder group and do not find anything, * we start scanning forwards from the beginning of the * filesystem. While it might seem sensible to start scanning * backwards or even to alternate looking forward and backward, * this approach fails badly when the filesystem is nearly full. * Specifically, we first search all the areas that have no space * and finally try the one preceding that. We repeat this on * every request and in the case of the final block end up * searching the entire filesystem. By jumping to the front * of the filesystem, our future forward searches always look * in new cylinder groups so they find every possible block after * one pass over the filesystem. */ prefcg = ino_to_cg(fs, pip->i_number); for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } /* * This is a backstop when we have a deficit in space. */ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) return ((ino_t)(fs->fs_ipg * cg)); for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) break; return ((ino_t)(fs->fs_ipg * cg)); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks and the next fs_maxbpg blocks. Each additional section * contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. The first indirect is allocated immediately following the last * direct block and the data blocks for the first indirect immediately * follow it. * * If no blocks have been allocated in any other section, the indirect * block(s) are allocated in the same cylinder group as its inode in an * area reserved immediately following the inode blocks. The policy for * the data blocks is to place them in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups. When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. If the allocation is for the first block in an * indirect block or the previous block is a hole, then the information on * the previous allocation is unavailable; here a best guess is made based * on the logical block number being allocated. * * If a section is already partially allocated, the policy is to * allocate blocks contiguously within the section if possible.
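 *
 * Editor's illustration (hypothetical numbers): with fs_maxbpg = 2048,
 * section 0 covers the UFS_NDADDR direct blocks plus the first 2048
 * indirect-mapped blocks, each later section spans another 2048 blocks,
 * and crossing a section boundary is what triggers the cylinder-group
 * search in the code below.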
*/ ufs2_daddr_t ffs_blkpref_ufs1(ip, lbn, indx, bap) struct inode *ip; ufs_lbn_t lbn; int indx; ufs1_daddr_t *bap; { struct fs *fs; u_int cg, inocg; u_int avgbfree, startcg; ufs2_daddr_t pref; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ - if (indx == -1 && lbn < NDADDR + NINDIR(fs) && - ip->i_din1->di_db[NDADDR - 1] != 0) - pref = ip->i_din1->di_db[NDADDR - 1] + fs->fs_frag; + if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && + ip->i_din1->di_db[UFS_NDADDR - 1] != 0) + pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ - if (lbn == NDADDR) { + if (lbn == UFS_NDADDR) { pref = ip->i_din1->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) return (pref + fs->fs_frag); } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ - if (lbn < NDADDR + NINDIR(fs)) + if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, bap[indx - 1]) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. 
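* For example, if bap[indx - 1] names a block at fragment address * 4096 and fs_frag is 8, the preference returned below is 4104, the * fragment immediately following that block.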
*/ return (bap[indx - 1] + fs->fs_frag); } /* * Same as above, but for UFS2 */ ufs2_daddr_t ffs_blkpref_ufs2(ip, lbn, indx, bap) struct inode *ip; ufs_lbn_t lbn; int indx; ufs2_daddr_t *bap; { struct fs *fs; u_int cg, inocg; u_int avgbfree, startcg; ufs2_daddr_t pref; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ - if (indx == -1 && lbn < NDADDR + NINDIR(fs) && - ip->i_din2->di_db[NDADDR - 1] != 0) - pref = ip->i_din2->di_db[NDADDR - 1] + fs->fs_frag; + if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && + ip->i_din2->di_db[UFS_NDADDR - 1] != 0) + pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ - if (lbn == NDADDR) { + if (lbn == UFS_NDADDR) { pref = ip->i_din2->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) return (pref + fs->fs_frag); } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ - if (lbn < NDADDR + NINDIR(fs)) + if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, bap[indx - 1]) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ return (bap[indx - 1] + fs->fs_frag); } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. 
* 2) quadratically rehash on the cylinder group number. * 3) brute force search for a free block. * * Must be called with the UFS lock held. Will release the lock on success * and return with it held on failure. */ /*VARARGS5*/ static ufs2_daddr_t ffs_hashalloc(ip, cg, pref, size, rsize, allocator) struct inode *ip; u_int cg; ufs2_daddr_t pref; int size; /* Search size for data blocks, mode for inodes */ int rsize; /* Real allocated size. */ allocfcn_t *allocator; { struct fs *fs; ufs2_daddr_t result; u_int i, icg = cg; mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); #ifdef INVARIANTS if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_hashalloc: allocation on suspended filesystem"); #endif fs = ITOFS(ip); /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a fragment can be extended. * * Check to see if the necessary fragments are available, and * if they are, allocate them. */ static ufs2_daddr_t ffs_fragextend(ip, cg, bprev, osize, nsize) struct inode *ip; u_int cg; ufs2_daddr_t bprev; int osize, nsize; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int nffree; long bno; int frags, bbase; int i, error; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) return (0); frags = numfrags(fs, nsize); bbase = fragnum(fs, bprev); if (bbase > fragnum(fs, (bprev + frags - 1))) { /* cannot extend across a block boundary */ return (0); } UFS_UNLOCK(ump); error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) goto fail; cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) goto fail; bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_old_time = cgp->cg_time = time_second; bno = dtogd(fs, bprev); blksfree = cg_blksfree(cgp); for (i = numfrags(fs, osize); i < frags; i++) if (isclr(blksfree, bno + i)) goto fail; /* * the current fragment can be extended * deduct the count on fragment being extended into * increase the count on the remaining fragment (if any) * allocate the extended piece */ for (i = frags; i < fs->fs_frag - bbase; i++) if (isclr(blksfree, bno + i)) break; cgp->cg_frsum[i - numfrags(fs, osize)]--; if (i != frags) cgp->cg_frsum[i - frags]++; for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree--; nffree++; } UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= nffree; fs->fs_cs(fs, cg).cs_nffree -= nffree; fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it.
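* * For example, with 8K blocks and 1K fragments a 4K request needs * numfrags(fs, 4K) == 4 fragments: cg_frsum[] is scanned upward from * index 4 for the smallest existing run that fits, and a whole block * is broken up only if no such run exists.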
*/ static ufs2_daddr_t ffs_alloccg(ip, cg, bpref, size, rsize) struct inode *ip; u_int cg; ufs2_daddr_t bpref; int size; int rsize; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; int i, allocsiz, error, frags; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); UFS_UNLOCK(ump); error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) goto fail; cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp) || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) goto fail; bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_old_time = cgp->cg_time = time_second; if (size == fs->fs_bsize) { UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ blksfree = cg_blksfree(cgp); frags = numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } KASSERT(size == rsize, ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; for (i = 0; i < frags; i++) clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree -= frags; cgp->cg_frsum[allocsiz]--; if (frags != allocsiz) cgp->cg_frsum[allocsiz - frags]++; UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; blkno = cgbase(fs, cg) + bno; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t ffs_alloccgblk(ip, bp, bpref, size) struct inode *ip; struct buf *bp; ufs2_daddr_t bpref; int size; { struct fs *fs; struct cg *cgp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; u_int8_t *blksfree; int i, cgbpref; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); if (bpref == 0) { bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { /* map bpref to correct zone in this cg */ if (bpref < cgdata(fs, cgbpref)) bpref = cgmeta(fs, cgp->cg_cgx); else bpref = cgdata(fs, cgp->cg_cgx); } /* * if the requested block is available, use it */ bno = dtogd(fs, blknum(fs, bpref)); if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) goto gotit; /* * Take the next available block in this cylinder group. 
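* The search starts at bpref (derived from cg_rotor above when no * preference was given), so successive allocations walk forward * through the group rather than rescanning its beginning.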
*/ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); if (bno < 0) return (0); /* Update cg_rotor only if allocated from the data zone */ if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) cgp->cg_rotor = bno; gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; /* * If the caller didn't want the whole block free the frags here. */ size = numfrags(fs, size); if (size != fs->fs_frag) { bno = dtogd(fs, blkno); for (i = size; i < fs->fs_frag; i++) setbit(blksfree, bno + i); i = fs->fs_frag - size; cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; fs->fs_fmod = 1; cgp->cg_frsum[i]++; } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); UFS_LOCK(ump); return (blkno); } /* * Determine whether a cluster can be allocated. * * We do not currently check for optimal rotational layout if there * are multiple choices in the same cylinder group. Instead we just * take the first one that we find following bpref. */ static ufs2_daddr_t ffs_clusteralloc(ip, cg, bpref, len) struct inode *ip; u_int cg; ufs2_daddr_t bpref; int len; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int i, run, bit, map, got; ufs2_daddr_t bno; u_char *mapp; int32_t *lp; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_maxcluster[cg] < len) return (0); UFS_UNLOCK(ump); if (bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp)) goto fail_lock; cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) goto fail_lock; bp->b_xflags |= BX_BKGRDWRITE; /* * Check to see if a cluster of the needed size (or bigger) is * available in this cylinder group. */ lp = &cg_clustersum(cgp)[len]; for (i = len; i <= fs->fs_contigsumsize; i++) if (*lp++ > 0) break; if (i > fs->fs_contigsumsize) { /* * This is the first time looking for a cluster in this * cylinder group. Update the cluster summary information * to reflect the true maximum sized cluster so that * future cluster allocation requests can avoid reading * the cylinder group map only to find no clusters. */ lp = &cg_clustersum(cgp)[len - 1]; for (i = len - 1; i > 0; i--) if (*lp-- > 0) break; UFS_LOCK(ump); fs->fs_maxcluster[cg] = i; goto fail; } /* * Search the cluster map to find a big enough cluster. * We take the first one that we find, even if it is larger * than we need as we prefer to get one close to the previous * block allocation. We do not search before the current * preference point as we do not want to allocate a block * that is allocated before the previous one (as we will * then have to wait for another pass of the elevator * algorithm before it will be read). We prefer to fail and * be recalled to try an allocation in the next cylinder group. */ if (dtog(fs, bpref) != cg) bpref = cgdata(fs, cg); else bpref = blknum(fs, bpref); bpref = fragstoblks(fs, dtogd(fs, bpref)); mapp = &cg_clustersfree(cgp)[bpref / NBBY]; map = *mapp++; bit = 1 << (bpref % NBBY); for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { if ((map & bit) == 0) { run = 0; } else { run++; if (run == len) break; } if ((got & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (got >= cgp->cg_nclusterblks) goto fail_lock; /* * Allocate the cluster that we have found. 
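* The scan above stopped with got at the last block of a run of * exactly len free blocks, so the cluster spans cluster-relative * blocks got - run + 1 through got; each of its blocks is claimed * individually via ffs_alloccgblk() below.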
*/ blksfree = cg_blksfree(cgp); for (i = 1; i <= len; i++) if (!ffs_isblock(fs, blksfree, got - run + i)) panic("ffs_clusteralloc: map mismatch"); bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); if (dtog(fs, bno) != cg) panic("ffs_clusteralloc: allocated out of group"); len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (bno); fail_lock: UFS_LOCK(ump); fail: brelse(bp); return (0); } static inline struct buf * getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags) { struct fs *fs; fs = ITOFS(ip); return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, gbflags)); } /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using the following policy: * 1) allocate the requested inode. * 2) allocate the next available inode after the requested * inode in the specified cylinder group. */ static ufs2_daddr_t ffs_nodealloccg(ip, cg, ipref, mode, unused) struct inode *ip; u_int cg; ufs2_daddr_t ipref; int mode; int unused; { struct fs *fs; struct cg *cgp; struct buf *bp, *ibp; struct ufsmount *ump; u_int8_t *inosused, *loc; struct ufs2_dinode *dp2; int error, start, len, i; u_int32_t old_initediblk; ump = ITOUMP(ip); fs = ump->um_fs; check_nifree: if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); UFS_UNLOCK(ump); error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); UFS_LOCK(ump); return (0); } cgp = (struct cg *)bp->b_data; restart: if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { brelse(bp); UFS_LOCK(ump); return (0); } bp->b_xflags |= BX_BKGRDWRITE; inosused = cg_inosused(cgp); if (ipref) { ipref %= fs->fs_ipg; if (isclr(inosused, ipref)) goto gotit; } start = cgp->cg_irotor / NBBY; len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { len = start + 1; start = 0; loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { printf("cg = %d, irotor = %ld, fs = %s\n", cg, (long)cgp->cg_irotor, fs->fs_fsmnt); panic("ffs_nodealloccg: map corrupted"); /* NOTREACHED */ } } ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; gotit: /* * Check to see if we need to initialize more inodes. */ if (fs->fs_magic == FS_UFS2_MAGIC && ipref + INOPB(fs) > cgp->cg_initediblk && cgp->cg_initediblk < cgp->cg_niblk) { old_initediblk = cgp->cg_initediblk; /* * Free the cylinder group lock before writing the * initialized inode block. Entering the * babarrierwrite() with the cylinder group lock * causes lock order violation between the lock and * snaplk. * * Another thread can decide to initialize the same * inode block, but whichever thread first gets the * cylinder group lock after writing the newly * allocated inode block will update it and the other * will realize that it has lost and leave the * cylinder group unchanged. */ ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); brelse(bp); if (ibp == NULL) { /* * The inode block buffer is already owned by * another thread, which must initialize it. * Wait on the buffer to allow another thread * to finish the updates, with dropped cg * buffer lock, then retry. 
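* Once the wait completes, the restart re-reads the cylinder group * and sees the other thread's update to cg_initediblk, so the * retried scan works with the freshly initialized inodes.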
*/ ibp = getinobuf(ip, cg, old_initediblk, 0); brelse(ibp); UFS_LOCK(ump); goto check_nifree; } bzero(ibp->b_data, (int)fs->fs_bsize); dp2 = (struct ufs2_dinode *)(ibp->b_data); for (i = 0; i < INOPB(fs); i++) { while (dp2->di_gen == 0) dp2->di_gen = arc4random(); dp2++; } /* * Rather than adding a soft updates dependency to ensure * that the new inode block is written before it is claimed * by the cylinder group map, we just do a barrier write * here. The barrier write will ensure that the inode block * gets written before the updated cylinder group map can be * written. The barrier write should only slow down bulk * loading of newly created filesystems. */ babarrierwrite(ibp); /* * After the inode block is written, try to update the * cg initediblk pointer. If another thread beat us * to it, then leave it unchanged as the other thread * has already set it correctly. */ error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); UFS_LOCK(ump); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (error != 0) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (cgp->cg_initediblk == old_initediblk) cgp->cg_initediblk += INOPB(fs); goto restart; } cgp->cg_old_time = cgp->cg_time = time_second; cgp->cg_irotor = ipref; UFS_LOCK(ump); ACTIVECLEAR(fs, cg); setbit(inosused, ipref); cgp->cg_cs.cs_nifree--; fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; fs->fs_fmod = 1; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir++; fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); bdwrite(bp); return ((ino_t)(cg * fs->fs_ipg + ipref)); } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. 
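* * For example, freeing the last in-use fragment of a block whose * other fragments are already free reassembles the whole block: the * fragment counts are debited by fs_frag and cs_nbfree is credited * with one full block.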
*/ static void ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; struct workhead *dephd; { struct mount *mp; struct cg *cgp; struct buf *bp; ufs1_daddr_t fragno, cgbno; ufs2_daddr_t cgblkno; int i, blk, frags, bbase; u_int cg; u_int8_t *blksfree; struct cdev *dev; cg = dtog(fs, bno); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; cgblkno = fragstoblks(fs, cgtod(fs, cg)); } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ dev = devvp->v_rdev; cgblkno = fsbtodb(fs, cgtod(fs, cg)); ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg"); } else return; #ifdef INVARIANTS if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_blkfree_cg: bad size"); } #endif if ((u_int)bno >= fs->fs_size) { printf("bad block %jd, ino %lu\n", (intmax_t)bno, (u_long)inum); ffs_fserr(fs, inum, "bad block"); return; } if (bread(devvp, cgblkno, (int)fs->fs_cgsize, NOCRED, &bp)) { brelse(bp); return; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return; } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_old_time = cgp->cg_time = time_second; cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); UFS_LOCK(ump); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) { if (devvp->v_type == VREG) { UFS_UNLOCK(ump); /* devvp is a snapshot */ brelse(bp); return; } printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free block"); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) { printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)(bno + i), fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free frag"); } setbit(blksfree, cgbno + i); } cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); mp = UFSTOVFS(ump); if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, numfrags(fs, size), dephd); bdwrite(bp); } struct ffs_blkfree_trim_params { struct task task; struct ufsmount *ump; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; struct workhead *pdephd; struct workhead dephd; }; static void ffs_blkfree_trim_task(ctx, pending) void *ctx; int 
pending; { struct ffs_blkfree_trim_params *tp; tp = ctx; ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size, tp->inum, tp->pdephd); vn_finished_secondary_write(UFSTOVFS(tp->ump)); atomic_add_int(&tp->ump->um_trim_inflight, -1); free(tp, M_TEMP); } static void ffs_blkfree_trim_completed(bip) struct bio *bip; { struct ffs_blkfree_trim_params *tp; tp = bip->bio_caller2; g_destroy_bio(bip); TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); } void ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *dephd; { struct mount *mp; struct bio *bip; struct ffs_blkfree_trim_params *tp; /* * Check to see if a snapshot wants to claim the block. * Check that devvp is a normal disk device, not a snapshot, * it has a snapshot(s) associated with it, and one of the * snapshots wants to claim the block. */ if (devvp->v_type == VCHR && (devvp->v_vflag & VV_COPYONWRITE) && ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { return; } /* * Nothing to delay if TRIM is disabled, or the operation is * performed on the snapshot. */ if (!ump->um_candelete || devvp->v_type == VREG) { ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); return; } /* * Postpone the set of the free bit in the cg bitmap until the * BIO_DELETE is completed. Otherwise, due to disk queue * reordering, TRIM might be issued after we reuse the block * and write some new data into it. */ atomic_add_int(&ump->um_trim_inflight, 1); tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK); tp->ump = ump; tp->devvp = devvp; tp->bno = bno; tp->size = size; tp->inum = inum; if (dephd != NULL) { LIST_INIT(&tp->dephd); LIST_SWAP(dephd, &tp->dephd, worklist, wk_list); tp->pdephd = &tp->dephd; } else tp->pdephd = NULL; bip = g_alloc_bio(); bip->bio_cmd = BIO_DELETE; bip->bio_offset = dbtob(fsbtodb(fs, bno)); bip->bio_done = ffs_blkfree_trim_completed; bip->bio_length = size; bip->bio_caller2 = tp; mp = UFSTOVFS(ump); vn_start_secondary_write(NULL, &mp, 0); g_io_request(bip, (struct g_consumer *)devvp->v_bufobj.bo_private); } #ifdef INVARIANTS /* * Verify allocation of a block or fragment. Returns true if block or * fragment is allocated, false if it is free. */ static int ffs_checkblk(ip, bno, size) struct inode *ip; ufs2_daddr_t bno; long size; { struct fs *fs; struct cg *cgp; struct buf *bp; ufs1_daddr_t cgbno; int i, error, frags, free; u_int8_t *blksfree; fs = ITOFS(ip); if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("bsize = %ld, size = %ld, fs = %s\n", (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_checkblk: bad size"); } if ((u_int)bno >= fs->fs_size) panic("ffs_checkblk: bad block %jd", (intmax_t)bno); error = bread(ITODEVVP(ip), fsbtodb(fs, cgtod(fs, dtog(fs, bno))), (int)fs->fs_cgsize, NOCRED, &bp); if (error) panic("ffs_checkblk: cg bread failed"); cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) panic("ffs_checkblk: cg magic mismatch"); bp->b_xflags |= BX_BKGRDWRITE; blksfree = cg_blksfree(cgp); cgbno = dtogd(fs, bno); if (size == fs->fs_bsize) { free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); } else { frags = numfrags(fs, size); for (free = 0, i = 0; i < frags; i++) if (isset(blksfree, cgbno + i)) free++; if (free != 0 && free != frags) panic("ffs_checkblk: partially free fragment"); } brelse(bp); return (!free); } #endif /* INVARIANTS */ /* * Free an inode. 
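* With soft updates active the free is deferred through * softdep_freefile(); otherwise ffs_freefile() clears the bit in the * inode map directly.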
*/ int ffs_vfree(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct ufsmount *ump; struct inode *ip; if (DOINGSOFTDEP(pvp)) { softdep_freefile(pvp, ino, mode); return (0); } ip = VTOI(pvp); ump = VFSTOUFS(pvp->v_mount); return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); } /* * Do the actual free operation. * The specified inode is placed back in the free map. */ int ffs_freefile(ump, fs, devvp, ino, mode, wkhd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ino_t ino; int mode; struct workhead *wkhd; { struct cg *cgp; struct buf *bp; ufs2_daddr_t cgbno; int error; u_int cg; u_int8_t *inosused; struct cdev *dev; cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; cgbno = fragstoblks(fs, cgtod(fs, cg)); } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ dev = devvp->v_rdev; cgbno = fsbtodb(fs, cgtod(fs, cg)); } else { bp = NULL; return (0); } if (ino >= fs->fs_ipg * fs->fs_ncg) panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_old_time = cgp->cg_time = time_second; inosused = cg_inosused(cgp); ino %= fs->fs_ipg; if (isclr(inosused, ino)) { printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), (uintmax_t)(ino + cg * fs->fs_ipg), fs->fs_fsmnt); if (fs->fs_ronly == 0) panic("ffs_freefile: freeing free inode"); } clrbit(inosused, ino); if (ino < cgp->cg_irotor) cgp->cg_irotor = ino; cgp->cg_cs.cs_nifree++; UFS_LOCK(ump); fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir--; fs->fs_cstotal.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) softdep_setup_inofree(UFSTOVFS(ump), bp, ino + cg * fs->fs_ipg, wkhd); bdwrite(bp); return (0); } /* * Check to see if a file is free. */ int ffs_checkfreefile(fs, devvp, ino) struct fs *fs; struct vnode *devvp; ino_t ino; { struct cg *cgp; struct buf *bp; ufs2_daddr_t cgbno; int ret; u_int cg; u_int8_t *inosused; cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ cgbno = fragstoblks(fs, cgtod(fs, cg)); } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ cgbno = fsbtodb(fs, cgtod(fs, cg)); } else { return (1); } if (ino >= fs->fs_ipg * fs->fs_ncg) return (1); if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp)) { brelse(bp); return (1); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (1); } inosused = cg_inosused(cgp); ino %= fs->fs_ipg; ret = isclr(inosused, ino); brelse(bp); return (ret); } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. 
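* * For example, with fs_frag == 8 each byte of the free map covers * exactly one block, and a request for three fragments scans (via * fragtbl) for a byte whose bit pattern holds a run of at least * three free fragments, starting at the rotor and wrapping around * once if necessary.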
*/ static ufs1_daddr_t ffs_mapsearch(fs, cgp, bpref, allocsiz) struct fs *fs; struct cg *cgp; ufs2_daddr_t bpref; int allocsiz; { ufs1_daddr_t bno; int start, len, loc, i; int blk, field, subfield, pos; u_int8_t *blksfree; /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = cgp->cg_frotor / NBBY; blksfree = cg_blksfree(cgp); len = howmany(fs->fs_fpg, NBBY) - start; loc = scanc((u_int)len, (u_char *)&blksfree[start], fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((u_int)len, (u_char *)&blksfree[0], fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { printf("start = %d, len = %d, fs = %s\n", start, len, fs->fs_fsmnt); panic("ffs_alloccg: map corrupted"); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = bno; /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, blksfree, bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); panic("ffs_alloccg: block not in map"); return (-1); } /* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: * fs: error message */ void ffs_fserr(fs, inum, cp) struct fs *fs; ino_t inum; char *cp; { struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, fs->fs_fsmnt, cp); } /* * This function provides the capability for the fsck program to * update an active filesystem. Fourteen operations are provided: * * adjrefcnt(inode, amt) - adjusts the reference count on the * specified inode by the specified amount. Under normal * operation the count should always go down. Decrementing * the count to zero will cause the inode to be freed. * adjblkcnt(inode, amt) - adjust the number of blocks used by the * inode by the specified amount. * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - * adjust the superblock summary. * freedirs(inode, count) - directory inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freefiles(inode, count) - file inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] * are marked as free. Blocks should never have to be marked * as in use. * setflags(flags, set/clear) - the fs_flags field has the specified * flags set (second parameter +1) or cleared (second parameter -1). * setcwd(dirinode) - set the current directory to dirinode in the * filesystem associated with the snapshot. * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." * in the current directory is oldvalue then change it to newvalue. * unlink(nameptr, oldvalue) - Verify that the inode number associated * with nameptr in the current directory is oldvalue then unlink it. * * The following functions may only be used on a quiescent filesystem * by the soft updates journal. They are not safe to be run on an active * filesystem. 
* * setinode(inode, dip) - the specified disk inode is replaced with the * contents pointed to by dip. * setbufoutput(fd, flags) - output associated with the specified file * descriptor (which must reference the character device supporting * the filesystem) switches from using physio to running through the * buffer cache when flags is set to 1. The descriptor reverts to * physio for output when flags is set to zero. */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT, 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of directories"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of free blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of free inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of free frags"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust number of free clusters"); static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of Directory Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of File Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of Blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, sysctl_ffs_fsck, "Change Filesystem Flags"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR, sysctl_ffs_fsck, "Set Current Working Directory"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR, sysctl_ffs_fsck, "Change Value of .. 
Entry"); static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR, sysctl_ffs_fsck, "Unlink a Duplicate Name"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR, sysctl_ffs_fsck, "Update an On-Disk Inode"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR, sysctl_ffs_fsck, "Set Buffered Writing for Descriptor"); #define DEBUG 1 #ifdef DEBUG static int fsckcmds = 0; SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); #endif /* DEBUG */ static int buffered_write(struct file *, struct uio *, struct ucred *, int, struct thread *); static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct fsck_cmd cmd; struct ufsmount *ump; struct vnode *vp, *dvp, *fdvp; struct inode *ip, *dp; struct mount *mp; struct fs *fs; ufs2_daddr_t blkno; long blkcnt, blksize; struct file *fp, *vfp; cap_rights_t rights; int filetype, error; static struct fileops *origops, bufferedops; if (req->newlen > sizeof cmd) return (EBADRPC); if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); if ((error = getvnode(td, cmd.handle, cap_rights_init(&rights, CAP_FSCK), &fp)) != 0) return (error); vp = fp->f_data; if (vp->v_type != VREG && vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } vn_start_write(vp, &mp, V_WAIT); if (mp == NULL || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { vn_finished_write(mp); fdrop(fp, td); return (EINVAL); } ump = VFSTOUFS(mp); if ((mp->mnt_flag & MNT_RDONLY) && ump->um_fsckpid != td->td_proc->p_pid) { vn_finished_write(mp); fdrop(fp, td); return (EROFS); } fs = ump->um_fs; filetype = IFREG; switch (oidp->oid_number) { case FFS_SET_FLAGS: #ifdef DEBUG if (fsckcmds) printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, cmd.size > 0 ? "set" : "clear"); #endif /* DEBUG */ if (cmd.size > 0) fs->fs_flags |= (long)cmd.value; else fs->fs_flags &= ~(long)cmd.value; break; case FFS_ADJ_REFCNT: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust inode %jd link count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DEBUG */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); ip->i_nlink += cmd.size; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_effnlink += cmd.size; ip->i_flag |= IN_CHANGE | IN_MODIFIED; error = ffs_update(vp, 1); if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); vput(vp); break; case FFS_ADJ_BLKCNT: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust inode %jd block count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DEBUG */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); ip->i_flag |= IN_CHANGE | IN_MODIFIED; error = ffs_update(vp, 1); vput(vp); break; case FFS_DIR_FREE: filetype = IFDIR; /* fall through */ case FFS_FILE_FREE: #ifdef DEBUG if (fsckcmds) { if (cmd.size == 1) printf("%s: free %s inode %ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value); else printf("%s: free %s inodes %ju-%ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? 
"directory" : "file", (uintmax_t)cmd.value, (uintmax_t)(cmd.value + cmd.size - 1)); } #endif /* DEBUG */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; } break; case FFS_BLK_FREE: #ifdef DEBUG if (fsckcmds) { if (cmd.size == 1) printf("%s: free block %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); else printf("%s: free blocks %jd-%jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.value + cmd.size - 1); } #endif /* DEBUG */ blkno = cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); while (blkcnt > 0) { if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, - blksize * fs->fs_fsize, ROOTINO, VDIR, NULL); + blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; } break; /* * Adjust superblock summaries. fsck(8) is expected to * submit deltas when necessary. */ case FFS_ADJ_NDIR: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of directories by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_ndir += cmd.value; break; case FFS_ADJ_NBFREE: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of free blocks by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_nbfree += cmd.value; break; case FFS_ADJ_NIFREE: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of free inodes by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_nifree += cmd.value; break; case FFS_ADJ_NFFREE: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of free frags by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_nffree += cmd.value; break; case FFS_ADJ_NUMCLUSTERS: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust number of free clusters by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ fs->fs_cstotal.cs_numclusters += cmd.value; break; case FFS_SET_CWD: #ifdef DEBUG if (fsckcmds) { printf("%s: set current directory to inode %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) break; AUDIT_ARG_VNODE1(vp); if ((error = change_dir(vp, td)) != 0) { vput(vp); break; } VOP_UNLOCK(vp, 0); pwd_chdir(td, vp); break; case FFS_SET_DOTDOT: #ifdef DEBUG if (fsckcmds) { printf("%s: change .. in cwd from %jd to %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DEBUG */ /* * First we have to get and lock the parent directory * to which ".." points. */ error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); if (error) break; /* * Now we get and lock the child directory containing "..". 
*/ FILEDESC_SLOCK(td->td_proc->p_fd); dvp = td->td_proc->p_fd->fd_cdir; FILEDESC_SUNLOCK(td->td_proc->p_fd); if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) { vput(fdvp); break; } dp = VTOI(dvp); dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, DT_DIR, 0); cache_purge(fdvp); cache_purge(dvp); vput(dvp); vput(fdvp); break; case FFS_UNLINK: #ifdef DEBUG if (fsckcmds) { char buf[32]; if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) strncpy(buf, "Name_too_long", 32); printf("%s: unlink %s (inode %jd)\n", mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); } #endif /* DEBUG */ /* * kern_unlinkat will do its own start/finish writes and * they do not nest, so drop ours here. Setting mp == NULL * indicates that vn_finished_write is not needed down below. */ vn_finished_write(mp); mp = NULL; error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, UIO_USERSPACE, (ino_t)cmd.size); break; case FFS_SET_INODE: if (ump->um_fsckpid != td->td_proc->p_pid) { error = EPERM; break; } #ifdef DEBUG if (fsckcmds) { printf("%s: update inode %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DEBUG */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; AUDIT_ARG_VNODE1(vp); ip = VTOI(vp); if (I_IS_UFS1(ip)) error = copyin((void *)(intptr_t)cmd.size, ip->i_din1, sizeof(struct ufs1_dinode)); else error = copyin((void *)(intptr_t)cmd.size, ip->i_din2, sizeof(struct ufs2_dinode)); if (error) { vput(vp); break; } ip->i_flag |= IN_CHANGE | IN_MODIFIED; error = ffs_update(vp, 1); vput(vp); break; case FFS_SET_BUFOUTPUT: if (ump->um_fsckpid != td->td_proc->p_pid) { error = EPERM; break; } if (ITOUMP(VTOI(vp)) != ump) { error = EINVAL; break; } #ifdef DEBUG if (fsckcmds) { printf("%s: %s buffered output for descriptor %jd\n", mp->mnt_stat.f_mntonname, cmd.size == 1 ? "enable" : "disable", (intmax_t)cmd.value); } #endif /* DEBUG */ if ((error = getvnode(td, cmd.value, cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0) break; if (vfp->f_vnode->v_type != VCHR) { fdrop(vfp, td); error = EINVAL; break; } if (origops == NULL) { origops = vfp->f_ops; bcopy((void *)origops, (void *)&bufferedops, sizeof(bufferedops)); bufferedops.fo_write = buffered_write; } if (cmd.size == 1) atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, (uintptr_t)&bufferedops); else atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, (uintptr_t)origops); fdrop(vfp, td); break; default: #ifdef DEBUG if (fsckcmds) { printf("Invalid request %d from fsck\n", oidp->oid_number); } #endif /* DEBUG */ error = EINVAL; break; } fdrop(fp, td); vn_finished_write(mp); return (error); } /* * Function to switch a descriptor to use the buffer cache to stage * its I/O. This is needed so that writes to the filesystem device * will give snapshots a chance to copy modified blocks for which it * needs to retain copies. */ static int buffered_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; int flags; struct thread *td; { struct vnode *devvp, *vp; struct inode *ip; struct buf *bp; struct fs *fs; struct filedesc *fdp; int error; daddr_t lbn; /* * The devvp is associated with the /dev filesystem. To discover * the filesystem with which the device is associated, we depend * on the application setting the current directory to a location * within the filesystem being written. Yes, this is an ugly hack. 
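* (The FFS_SET_CWD operation above exists in part so that the * checker can point its working directory into the target * filesystem before enabling buffered output on the device * descriptor.)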
*/ devvp = fp->f_vnode; if (!vn_isdisk(devvp, NULL)) return (EINVAL); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); vp = fdp->fd_cdir; vref(vp); FILEDESC_SUNLOCK(fdp); vn_lock(vp, LK_SHARED | LK_RETRY); /* * Check that the current directory vnode indeed belongs to * UFS before trying to dereference UFS-specific v_data fields. */ if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) { vput(vp); return (EINVAL); } ip = VTOI(vp); if (ITODEVVP(ip) != devvp) { vput(vp); return (EINVAL); } fs = ITOFS(ip); vput(vp); foffset_lock_uio(fp, uio, flags); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); #ifdef DEBUG if (fsckcmds) { printf("%s: buffered write for block %jd\n", fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset)); } #endif /* DEBUG */ /* * All I/O must be contained within a filesystem block, start on * a fragment boundary, and be a multiple of fragments in length. */ if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) || fragoff(fs, uio->uio_offset) != 0 || fragoff(fs, uio->uio_resid) != 0) { error = EINVAL; goto out; } lbn = numfrags(fs, uio->uio_offset); bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0); bp->b_flags |= B_RELBUF; if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) { brelse(bp); goto out; } error = bwrite(bp); out: VOP_UNLOCK(devvp, 0); foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF); return (error); } Index: head/sys/ufs/ffs/ffs_balloc.c =================================================================== --- head/sys/ufs/ffs/ffs_balloc.c (revision 313779) +++ head/sys/ufs/ffs/ffs_balloc.c (revision 313780) @@ -1,1148 +1,1150 @@ /*- * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Balloc defines the structure of filesystem storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. * This is the allocation strategy for UFS1. Below is * the allocation strategy for UFS2. */ int ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, struct ucred *cred, int flags, struct buf **bpp) { struct inode *ip; struct ufs1_dinode *dp; ufs_lbn_t lbn, lastlbn; struct fs *fs; ufs1_daddr_t nb; struct buf *bp, *nbp; struct ufsmount *ump; - struct indir indirs[NIADDR + 2]; + struct indir indirs[UFS_NIADDR + 2]; int deallocated, osize, nsize, num, i, error; ufs2_daddr_t newb; ufs1_daddr_t *bap, pref; - ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; - ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1]; + ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1]; + ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1]; int unwindidx = -1; int saved_inbdflush; static struct timeval lastfail; static int curfail; int gbflags, reclaimed; ip = VTOI(vp); dp = ip->i_din1; fs = ITOFS(ip); ump = ITOUMP(ip); lbn = lblkno(fs, startoffset); size = blkoff(fs, startoffset) + size; reclaimed = 0; if (size > fs->fs_bsize) panic("ffs_balloc_ufs1: blk too big"); *bpp = NULL; if (flags & IO_EXT) return (EOPNOTSUPP); if (lbn < 0) return (EFBIG); gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. 
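* For example, if the file currently ends in a 3K fragment on an * 8K-block filesystem and the write begins in a later block, * ffs_realloccg() below first grows that fragment to a full 8K * block (osize up to fs_bsize) before the new block is allocated.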
*/ lastlbn = lblkno(fs, ip->i_size); - if (lastlbn < NDADDR && lastlbn < lbn) { + if (lastlbn < UFS_NDADDR && lastlbn < lbn) { nb = lastlbn; osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, nb, dp->di_db[nb], ffs_blkpref_ufs1(ip, lastlbn, (int)nb, &dp->di_db[0]), osize, (int)fs->fs_bsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, nb, dbtofsb(fs, bp->b_blkno), dp->di_db[nb], fs->fs_bsize, osize, bp); ip->i_size = smalllblktosize(fs, nb + 1); dp->di_size = ip->i_size; dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(vp)) bdwrite(bp); else bawrite(bp); } } /* - * The first NDADDR blocks are direct blocks + * The first UFS_NDADDR blocks are direct blocks */ - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { if (flags & BA_METAONLY) panic("ffs_balloc_ufs1: BA_METAONLY for direct block"); nb = dp->di_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread(vp, lbn, osize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); } else { UFS_LOCK(ump); error = ffs_realloccg(ip, lbn, dp->di_db[lbn], ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]), osize, nsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, dbtofsb(fs, bp->b_blkno), nb, nsize, osize, bp); } } else { if (ip->i_size < smalllblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; UFS_LOCK(ump); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]), nsize, flags, cred, &newb); if (error) return (error); bp = getblk(vp, lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, newb, 0, nsize, 0, bp); } dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); } /* * Determine the number of levels of indirection. */ pref = 0; if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) return(error); #ifdef INVARIANTS if (num < 1) panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block"); #endif saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH); /* * Fetch the first indirect block allocating if necessary. 
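* indirs[0].in_off names the slot in dp->di_ib[]; the deeper * indirs[] entries are consumed one level at a time by the loop * that follows.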
*/ --num; nb = dp->di_ib[indirs[0].in_off]; allocib = NULL; allocblk = allociblk; lbns_remfree = lbns; if (nb == 0) { UFS_LOCK(ump); pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1, (ufs1_daddr_t *)0); if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, &newb)) != 0) { curthread_pflags_restore(saved_inbdflush); return (error); } pref = newb + fs->fs_frag; nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { - softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, - newb, 0, fs->fs_bsize, 0, bp); + softdep_setup_allocdirect(ip, + UFS_NDADDR + indirs[0].in_off, newb, 0, + fs->fs_bsize, 0, bp); bdwrite(bp); } else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } else { if ((error = bwrite(bp)) != 0) goto fail; } allocib = &dp->di_ib[indirs[0].in_off]; *allocib = nb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. */ retry: for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); goto fail; } bap = (ufs1_daddr_t *)bp->b_data; nb = bap[indirs[i].in_off]; if (i == num) break; i += 1; if (nb != 0) { bqrelse(bp); continue; } UFS_LOCK(ump); /* * If parent indirect has just been allocated, try to cluster * immediately following it. */ if (pref == 0) pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1, (ufs1_daddr_t *)0); if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb)) != 0) { brelse(bp); if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); goto retry; } if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem " "is full\n", fs->fs_fsmnt); } goto fail; } pref = newb + fs->fs_frag; nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[i].in_lbn; nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) { softdep_setup_allocindir_meta(nbp, ip, bp, indirs[i - 1].in_off, nb); bdwrite(nbp); } else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) { if (nbp->b_bufsize == fs->fs_bsize) nbp->b_flags |= B_CLUSTEROK; bdwrite(nbp); } else { if ((error = bwrite(nbp)) != 0) { brelse(bp); goto fail; } } bap[indirs[i - 1].in_off] = nb; if (allocib == NULL && unwindidx < 0) unwindidx = i - 1; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } /* * If asked only for the indirect block, then return it. */ if (flags & BA_METAONLY) { curthread_pflags_restore(saved_inbdflush); *bpp = bp; return (0); } /* * Get the data block, allocating if necessary. */ if (nb == 0) { UFS_LOCK(ump); /* * If allocating metadata at the front of the cylinder * group and parent indirect block has just been allocated, * then cluster next to it if it is the first indirect in * the file. Otherwise it has been allocated in the metadata * area, so we want to find our own place out in the data area. 
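
Each missing indirect block found by the loop above is allocated, linked into its parent, and logged in allociblk[]/lbns[] so the fail: path can undo a half-built chain. A toy in-memory analogue of that bookkeeping, with calloc standing in for ffs_alloc() and a depth of at most 8 assumed:

#include <stdlib.h>

struct node { struct node *down; };

static int
descend(struct node *root, int num)
{
        struct node *undo[8], *np = root, **firstlink = NULL;
        int i, nundo = 0;

        for (i = 0; i < num && i < 8; i++) {
                if (np->down == NULL) {
                        if ((np->down = calloc(1, sizeof(*np))) == NULL)
                                goto fail;
                        if (firstlink == NULL)
                                firstlink = &np->down;
                        undo[nundo++] = np->down;       /* LIFO undo log */
                }
                np = np->down;
        }
        return (0);
fail:
        if (firstlink != NULL)
                *firstlink = NULL;              /* unlink the new tail */
        while (nundo > 0)                       /* then free it, LIFO */
                free(undo[--nundo]);
        return (-1);
}

int main(void)
{
        struct node root = { NULL };

        return (descend(&root, 3));
}

Once a node exists at some level, everything above it already existed too, so the newly allocated nodes always form a contiguous tail; clearing one link is enough to detach them all.
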
*/ - if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0)) + if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0)) pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb); if (error) { brelse(bp); if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); goto retry; } if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem " "is full\n", fs->fs_fsmnt); } goto fail; } nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = lbn; nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) softdep_setup_allocindir_page(ip, lbn, bp, indirs[i].in_off, nb, 0, nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } curthread_pflags_restore(saved_inbdflush); *bpp = nbp; return (0); } brelse(bp); if (flags & BA_CLRBUF) { int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; if (seqcount != 0 && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 && !(vm_page_count_severe() || buf_dirty_count_severe())) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->fs_bsize, NOCRED, MAXBSIZE, seqcount, gbflags, &nbp); } else { error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED, gbflags, &nbp); } if (error) { brelse(nbp); goto fail; } } else { nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); } curthread_pflags_restore(saved_inbdflush); *bpp = nbp; return (0); fail: curthread_pflags_restore(saved_inbdflush); /* * If we have failed to allocate any blocks, simply return the error. * This is the usual case and avoids the need to fsync the file. */ if (allocblk == allociblk && allocib == NULL && unwindidx == -1) return (error); /* * If we have failed part way through block allocation, we * have to deallocate any indirect blocks that we have allocated. * We have to fsync the file before we start to get rid of all * of its dependencies so that we do not leave them dangling. * We have to sync it at the end so that the soft updates code * does not find any untracked changes. Although this is really * slow, running out of disk space is not expected to be a common * occurrence. The error return from fsync is ignored as we already * have an error to return to the user. * * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT, 0); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; blkp < allocblk; blkp++, lbns_remfree++) { /* * We shall not leave the freed blocks on the vnode * buffer object lists. 
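
The read-before-write path above unpacks a sequential-access hint that callers packed into the upper bits of the flags word (BA_SEQMASK/BA_SEQSHIFT) and uses it to size the cluster_read() read-ahead. The shift and field width below are made up; only the pack/unpack pattern is the point:

#include <stdio.h>

#define SEQSHIFT 24
#define SEQMASK  (0x7f << SEQSHIFT)

int main(void)
{
        int flags = 0, seqcount = 5;

        flags |= (seqcount << SEQSHIFT) & SEQMASK;      /* pack the hint */
        printf("unpacked %d\n", (flags & SEQMASK) >> SEQSHIFT);
        return (0);
}
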
*/ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), ("mismatch1 l %jd %jd b %ju %ju", (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, (uintmax_t)bp->b_blkno, (uintmax_t)fsbtodb(fs, *blkp))); bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; bp->b_flags &= ~(B_ASYNC | B_CACHE); brelse(bp); } deallocated += fs->fs_bsize; } if (allocib != NULL) { *allocib = 0; } else if (unwindidx >= 0) { int r; r = bread(vp, indirs[unwindidx].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (r) { panic("Could not unwind indirect block, error %d", r); brelse(bp); } else { bap = (ufs1_daddr_t *)bp->b_data; bap[indirs[unwindidx].in_off] = 0; if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } } if (deallocated) { #ifdef QUOTA /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(deallocated), cred, FORCE); #endif dp->di_blocks -= btodb(deallocated); ip->i_flag |= IN_CHANGE | IN_UPDATE; } (void) ffs_syncvnode(vp, MNT_WAIT, 0); /* * After the buffers are invalidated and on-disk pointers are * cleared, free the blocks. */ for (blkp = allociblk; blkp < allocblk; blkp++) { #ifdef INVARIANTS if (blkp == allociblk) lbns_remfree = lbns; bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { panic("zombie1 %jd %ju %ju", (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, (uintmax_t)fsbtodb(fs, *blkp)); } lbns_remfree++; #endif ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, ip->i_number, vp->v_type, NULL); } return (error); } /* * Balloc defines the structure of file system storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. * This is the allocation strategy for UFS2. Above is * the allocation strategy for UFS1. */ int ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, struct ucred *cred, int flags, struct buf **bpp) { struct inode *ip; struct ufs2_dinode *dp; ufs_lbn_t lbn, lastlbn; struct fs *fs; struct buf *bp, *nbp; struct ufsmount *ump; - struct indir indirs[NIADDR + 2]; + struct indir indirs[UFS_NIADDR + 2]; ufs2_daddr_t nb, newb, *bap, pref; - ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; - ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1]; + ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1]; + ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1]; int deallocated, osize, nsize, num, i, error; int unwindidx = -1; int saved_inbdflush; static struct timeval lastfail; static int curfail; int gbflags, reclaimed; ip = VTOI(vp); dp = ip->i_din2; fs = ITOFS(ip); ump = ITOUMP(ip); lbn = lblkno(fs, startoffset); size = blkoff(fs, startoffset) + size; reclaimed = 0; if (size > fs->fs_bsize) panic("ffs_balloc_ufs2: blk too big"); *bpp = NULL; if (lbn < 0) return (EFBIG); gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * Check for allocating external data. */ if (flags & IO_EXT) { - if (lbn >= NXADDR) + if (lbn >= UFS_NXADDR) return (EFBIG); /* * If the next write will extend the data into a new block, * and the data is currently composed of a fragment * this fragment has to be extended to be a full block. 
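
ffs_balloc_ufs2() addresses external-attribute blocks with negative logical block numbers, -1 - lbn, so they share the vnode's buffer namespace with file data without colliding. A two-line sketch of the mapping, which is its own inverse:

#include <stdio.h>

static long extlbn(long lbn) { return (-1 - lbn); }

int main(void)
{
        long lbn = 3;

        /* 3 -> -4 -> 3 */
        printf("%ld -> %ld -> %ld\n", lbn, extlbn(lbn), extlbn(extlbn(lbn)));
        return (0);
}
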
*/ lastlbn = lblkno(fs, dp->di_extsize); if (lastlbn < lbn) { nb = lastlbn; osize = sblksize(fs, dp->di_extsize, nb); if (osize < fs->fs_bsize && osize > 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, -1 - nb, dp->di_extb[nb], ffs_blkpref_ufs2(ip, lastlbn, (int)nb, &dp->di_extb[0]), osize, (int)fs->fs_bsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocext(ip, nb, dbtofsb(fs, bp->b_blkno), dp->di_extb[nb], fs->fs_bsize, osize, bp); dp->di_extsize = smalllblktosize(fs, nb + 1); dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno); bp->b_xflags |= BX_ALTDATA; ip->i_flag |= IN_CHANGE; if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); } } /* * All blocks are direct blocks */ if (flags & BA_METAONLY) panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); nb = dp->di_extb[lbn]; if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); bp->b_xflags |= BX_ALTDATA; *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread_gb(vp, -1 - lbn, osize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); bp->b_xflags |= BX_ALTDATA; } else { UFS_LOCK(ump); error = ffs_realloccg(ip, -1 - lbn, dp->di_extb[lbn], ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]), osize, nsize, flags, cred, &bp); if (error) return (error); bp->b_xflags |= BX_ALTDATA; if (DOINGSOFTDEP(vp)) softdep_setup_allocext(ip, lbn, dbtofsb(fs, bp->b_blkno), nb, nsize, osize, bp); } } else { if (dp->di_extsize < smalllblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; UFS_LOCK(ump); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]), nsize, flags, cred, &newb); if (error) return (error); bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); bp->b_xflags |= BX_ALTDATA; if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) softdep_setup_allocext(ip, lbn, newb, 0, nsize, 0, bp); } dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE; *bpp = bp; return (0); } /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. 
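
The reallocation test above compares fragment-rounded sizes: if the rounded new size still fits in the rounded old size, the existing fragment is reused. fragroundup() is mask arithmetic, sketched here with 4K fragments assumed:

#include <stdio.h>

#define FMASK           (4096 - 1)
#define FRAGROUNDUP(sz) (((sz) + FMASK) & ~FMASK)

int main(void)
{
        int osize = FRAGROUNDUP(5000);  /* 8192 */
        int nsize = FRAGROUNDUP(7500);  /* 8192: fits, no reallocation */

        printf("%s\n", nsize <= osize ? "reuse fragment" : "realloc");
        return (0);
}
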
*/ lastlbn = lblkno(fs, ip->i_size); - if (lastlbn < NDADDR && lastlbn < lbn) { + if (lastlbn < UFS_NDADDR && lastlbn < lbn) { nb = lastlbn; osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { UFS_LOCK(ump); error = ffs_realloccg(ip, nb, dp->di_db[nb], ffs_blkpref_ufs2(ip, lastlbn, (int)nb, &dp->di_db[0]), osize, (int)fs->fs_bsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, nb, dbtofsb(fs, bp->b_blkno), dp->di_db[nb], fs->fs_bsize, osize, bp); ip->i_size = smalllblktosize(fs, nb + 1); dp->di_size = ip->i_size; dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (flags & IO_SYNC) bwrite(bp); else bawrite(bp); } } /* - * The first NDADDR blocks are direct blocks + * The first UFS_NDADDR blocks are direct blocks */ - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { if (flags & BA_METAONLY) panic("ffs_balloc_ufs2: BA_METAONLY for direct block"); nb = dp->di_db[lbn]; if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread_gb(vp, lbn, osize, NOCRED, gbflags, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); } else { UFS_LOCK(ump); error = ffs_realloccg(ip, lbn, dp->di_db[lbn], ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_db[0]), osize, nsize, flags, cred, &bp); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, dbtofsb(fs, bp->b_blkno), nb, nsize, osize, bp); } } else { if (ip->i_size < smalllblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; UFS_LOCK(ump); error = ffs_alloc(ip, lbn, ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_db[0]), nsize, flags, cred, &newb); if (error) return (error); bp = getblk(vp, lbn, nsize, 0, 0, gbflags); bp->b_blkno = fsbtodb(fs, newb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, newb, 0, nsize, 0, bp); } dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); } /* * Determine the number of levels of indirection. */ pref = 0; if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) return(error); #ifdef INVARIANTS if (num < 1) panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block"); #endif saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH); /* * Fetch the first indirect block allocating if necessary. 
*/ --num; nb = dp->di_ib[indirs[0].in_off]; allocib = NULL; allocblk = allociblk; lbns_remfree = lbns; if (nb == 0) { UFS_LOCK(ump); pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1, (ufs2_daddr_t *)0); if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, &newb)) != 0) { curthread_pflags_restore(saved_inbdflush); return (error); } pref = newb + fs->fs_frag; nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[1].in_lbn; bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, GB_UNMAPPED); bp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(bp); if (DOINGSOFTDEP(vp)) { - softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, - newb, 0, fs->fs_bsize, 0, bp); + softdep_setup_allocdirect(ip, + UFS_NDADDR + indirs[0].in_off, newb, 0, + fs->fs_bsize, 0, bp); bdwrite(bp); } else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } else { if ((error = bwrite(bp)) != 0) goto fail; } allocib = &dp->di_ib[indirs[0].in_off]; *allocib = nb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. */ retry: for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); goto fail; } bap = (ufs2_daddr_t *)bp->b_data; nb = bap[indirs[i].in_off]; if (i == num) break; i += 1; if (nb != 0) { bqrelse(bp); continue; } UFS_LOCK(ump); /* * If parent indirect has just been allocated, try to cluster * immediately following it. */ if (pref == 0) pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1, (ufs2_daddr_t *)0); if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb)) != 0) { brelse(bp); if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); goto retry; } if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem " "is full\n", fs->fs_fsmnt); } goto fail; } pref = newb + fs->fs_frag; nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = indirs[i].in_lbn; nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, GB_UNMAPPED); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) { softdep_setup_allocindir_meta(nbp, ip, bp, indirs[i - 1].in_off, nb); bdwrite(nbp); } else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) { if (nbp->b_bufsize == fs->fs_bsize) nbp->b_flags |= B_CLUSTEROK; bdwrite(nbp); } else { if ((error = bwrite(nbp)) != 0) { brelse(bp); goto fail; } } bap[indirs[i - 1].in_off] = nb; if (allocib == NULL && unwindidx < 0) unwindidx = i - 1; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } /* * If asked only for the indirect block, then return it. */ if (flags & BA_METAONLY) { curthread_pflags_restore(saved_inbdflush); *bpp = bp; return (0); } /* * Get the data block, allocating if necessary. */ if (nb == 0) { UFS_LOCK(ump); /* * If allocating metadata at the front of the cylinder * group and parent indirect block has just been allocated, * then cluster next to it if it is the first indirect in * the file. 
Otherwise it has been allocated in the metadata * area, so we want to find our own place out in the data area. */ - if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0)) + if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0)) pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags | IO_BUFLOCKED, cred, &newb); if (error) { brelse(bp); if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); UFS_UNLOCK(ump); goto retry; } if (ppsratecheck(&lastfail, &curfail, 1)) { ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem " "is full\n", fs->fs_fsmnt); } goto fail; } nb = newb; MPASS(allocblk < allociblk + nitems(allociblk)); MPASS(lbns_remfree < lbns + nitems(lbns)); *allocblk++ = nb; *lbns_remfree++ = lbn; nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); if (flags & BA_CLRBUF) vfs_bio_clrbuf(nbp); if (DOINGSOFTDEP(vp)) softdep_setup_allocindir_page(ip, lbn, bp, indirs[i].in_off, nb, 0, nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } curthread_pflags_restore(saved_inbdflush); *bpp = nbp; return (0); } brelse(bp); /* * If requested clear invalid portions of the buffer. If we * have to do a read-before-write (typical if BA_CLRBUF is set), * try to do some read-ahead in the sequential case to reduce * the number of I/O transactions. */ if (flags & BA_CLRBUF) { int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; if (seqcount != 0 && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 && !(vm_page_count_severe() || buf_dirty_count_severe())) { error = cluster_read(vp, ip->i_size, lbn, (int)fs->fs_bsize, NOCRED, MAXBSIZE, seqcount, gbflags, &nbp); } else { error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED, gbflags, &nbp); } if (error) { brelse(nbp); goto fail; } } else { nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); nbp->b_blkno = fsbtodb(fs, nb); } curthread_pflags_restore(saved_inbdflush); *bpp = nbp; return (0); fail: curthread_pflags_restore(saved_inbdflush); /* * If we have failed to allocate any blocks, simply return the error. * This is the usual case and avoids the need to fsync the file. */ if (allocblk == allociblk && allocib == NULL && unwindidx == -1) return (error); /* * If we have failed part way through block allocation, we * have to deallocate any indirect blocks that we have allocated. * We have to fsync the file before we start to get rid of all * of its dependencies so that we do not leave them dangling. * We have to sync it at the end so that the soft updates code * does not find any untracked changes. Although this is really * slow, running out of disk space is not expected to be a common * occurrence. The error return from fsync is ignored as we already * have an error to return to the user. * * XXX Still have to journal the free below */ (void) ffs_syncvnode(vp, MNT_WAIT, 0); for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; blkp < allocblk; blkp++, lbns_remfree++) { /* * We shall not leave the freed blocks on the vnode * buffer object lists. 
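
Both allocation paths rate-limit the "filesystem full" console message with ppsratecheck(9), so a process looping on a full filesystem cannot flood the log. A userland sketch of the same once-per-interval idea, with time(3) granularity standing in for the kernel's timeval check:

#include <stdio.h>
#include <time.h>

static int
once_per_second(time_t *last)
{
        time_t now = time(NULL);

        if (now == *last)
                return (0);             /* suppress repeats this second */
        *last = now;
        return (1);
}

int main(void)
{
        time_t last = 0;

        for (int i = 0; i < 1000000; i++)
                if (once_per_second(&last))
                        printf("write failed, filesystem is full\n");
        return (0);
}
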
*/ bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), ("mismatch2 l %jd %jd b %ju %ju", (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, (uintmax_t)bp->b_blkno, (uintmax_t)fsbtodb(fs, *blkp))); bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; bp->b_flags &= ~(B_ASYNC | B_CACHE); brelse(bp); } deallocated += fs->fs_bsize; } if (allocib != NULL) { *allocib = 0; } else if (unwindidx >= 0) { int r; r = bread(vp, indirs[unwindidx].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (r) { panic("Could not unwind indirect block, error %d", r); brelse(bp); } else { bap = (ufs2_daddr_t *)bp->b_data; bap[indirs[unwindidx].in_off] = 0; if (flags & IO_SYNC) { bwrite(bp); } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } } } if (deallocated) { #ifdef QUOTA /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(deallocated), cred, FORCE); #endif dp->di_blocks -= btodb(deallocated); ip->i_flag |= IN_CHANGE | IN_UPDATE; } (void) ffs_syncvnode(vp, MNT_WAIT, 0); /* * After the buffers are invalidated and on-disk pointers are * cleared, free the blocks. */ for (blkp = allociblk; blkp < allocblk; blkp++) { #ifdef INVARIANTS if (blkp == allociblk) lbns_remfree = lbns; bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT | GB_UNMAPPED); if (bp != NULL) { panic("zombie2 %jd %ju %ju", (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, (uintmax_t)fsbtodb(fs, *blkp)); } lbns_remfree++; #endif ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, ip->i_number, vp->v_type, NULL); } return (error); } Index: head/sys/ufs/ffs/ffs_inode.c =================================================================== --- head/sys/ufs/ffs/ffs_inode.c (revision 313779) +++ head/sys/ufs/ffs/ffs_inode.c (revision 313780) @@ -1,766 +1,767 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ffs_indirtrunc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, ufs2_daddr_t *); /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode * to disk if the IN_MODIFIED flag is set (it may be set initially, or by * the timestamp update). The IN_LAZYMOD flag is set to force a write * later if not now. The IN_LAZYACCESS is set instead of IN_MODIFIED if the fs * is currently being suspended (or is suspended) and vnode has been accessed. * If we write now, then clear IN_MODIFIED, IN_LAZYACCESS and IN_LAZYMOD to * reflect the presumably successful write, and if waitfor is set, then wait * for the write to complete. */ int ffs_update(vp, waitfor) struct vnode *vp; int waitfor; { struct fs *fs; struct buf *bp; struct inode *ip; int flags, error; ASSERT_VOP_ELOCKED(vp, "ffs_update"); ufs_itimes(vp); ip = VTOI(vp); if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); fs = ITOFS(ip); if (fs->fs_ronly && ITOUMP(ip)->um_fsckpid == 0) return (0); /* * If we are updating a snapshot and another process is currently * writing the buffer containing the inode for this snapshot then * a deadlock can occur when it tries to check the snapshot to see * if that block needs to be copied. Thus when updating a snapshot * we check to see if the buffer is already locked, and if it is * we drop the snapshot lock until the buffer has been written * and is available to us. We have to grab a reference to the * snapshot vnode to prevent it from being removed while we are * waiting for the buffer. */ flags = 0; if (IS_SNAPSHOT(ip)) flags = GB_LOCK_NOWAIT; loop: error = breadn_flags(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int) fs->fs_bsize, 0, 0, 0, NOCRED, flags, &bp); if (error != 0) { if (error != EBUSY) return (error); KASSERT((IS_SNAPSHOT(ip)), ("EBUSY from non-snapshot")); /* * Wait for our inode block to become available. * * Hold a reference to the vnode to protect against * ffs_snapgone(). Since we hold a reference, it can only * get reclaimed (VI_DOOMED flag) in a forcible downgrade * or unmount. For an unmount, the entire filesystem will be * gone, so we cannot attempt to touch anything associated * with it while the vnode is unlocked; all we can do is * pause briefly and try again. If when we relock the vnode * we discover that it has been reclaimed, updating it is no * longer necessary and we can just return an error. */ vref(vp); VOP_UNLOCK(vp, 0); pause("ffsupd", 1); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrele(vp); if ((vp->v_iflag & VI_DOOMED) != 0) return (ENOENT); goto loop; } if (DOINGSOFTDEP(vp)) softdep_update_inodeblock(ip, bp, waitfor); else if (ip->i_effnlink != ip->i_nlink) panic("ffs_update: bad link cnt"); if (I_IS_UFS1(ip)) { *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(&(ip->i_din1), sizeof(ip->i_din1), 1, RANDOM_FS_ATIME); } else { *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; /* XXX: FIX? 
The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(&(ip->i_din2), sizeof(ip->i_din2), 1, RANDOM_FS_ATIME); } if (waitfor) error = bwrite(bp); else if (vm_page_count_severe() || buf_dirty_count_severe()) { bawrite(bp); error = 0; } else { if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); error = 0; } return (error); } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode ip to at most length size, freeing the * disk blocks. */ int ffs_truncate(vp, length, flags, cred) struct vnode *vp; off_t length; int flags; struct ucred *cred; { struct inode *ip; - ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR]; - ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; + ufs2_daddr_t bn, lbn, lastblock, lastiblock[UFS_NIADDR]; + ufs2_daddr_t indir_lbn[UFS_NIADDR], oldblks[UFS_NDADDR + UFS_NIADDR]; + ufs2_daddr_t newblks[UFS_NDADDR + UFS_NIADDR]; ufs2_daddr_t count, blocksreleased = 0, datablocks, blkno; struct bufobj *bo; struct fs *fs; struct buf *bp; struct ufsmount *ump; int softdeptrunc, journaltrunc; int needextclean, extblocks; int offset, size, level, nblocks; int i, error, allerror, indiroff, waitforupdate; off_t osize; ip = VTOI(vp); ump = VFSTOUFS(vp->v_mount); fs = ump->um_fs; bo = &vp->v_bufobj; ASSERT_VOP_LOCKED(vp, "ffs_truncate"); if (length < 0) return (EINVAL); if (length > fs->fs_maxfilesize) return (EFBIG); #ifdef QUOTA error = getinoquota(ip); if (error) return (error); #endif /* * Historically clients did not have to specify which data * they were truncating. So, if not specified, we assume * traditional behavior, e.g., just the normal data. */ if ((flags & (IO_EXT | IO_NORMAL)) == 0) flags |= IO_NORMAL; if (!DOINGSOFTDEP(vp) && !DOINGASYNC(vp)) flags |= IO_SYNC; waitforupdate = (flags & IO_SYNC) != 0 || !DOINGASYNC(vp); /* * If we are truncating the extended-attributes, and cannot * do it with soft updates, then do it slowly here. If we are * truncating both the extended attributes and the file contents * (e.g., the file is being unlinked), then pick it off with * soft updates below. 
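
ffs_update() above chooses among the three buffer write disciplines, and the same three-way pattern recurs throughout this patch. A sketch of the policy; bwrite, bawrite and bdwrite are the real buf(9) interfaces, while the enum and helper are invented for illustration:

enum wmode { WM_SYNC, WM_ASYNC, WM_DELAYED };

static enum wmode
pick_write(int waitfor, int memory_pressure)
{
        if (waitfor)
                return (WM_SYNC);       /* bwrite: caller sleeps for I/O */
        if (memory_pressure)
                return (WM_ASYNC);      /* bawrite: start I/O, don't wait */
        return (WM_DELAYED);            /* bdwrite: dirty it, write later */
}
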
*/ allerror = 0; needextclean = 0; softdeptrunc = 0; journaltrunc = DOINGSUJ(vp); if (journaltrunc == 0 && DOINGSOFTDEP(vp) && length == 0) softdeptrunc = !softdep_slowdown(vp); extblocks = 0; datablocks = DIP(ip, i_blocks); if (fs->fs_magic == FS_UFS2_MAGIC && ip->i_din2->di_extsize > 0) { extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks -= extblocks; } if ((flags & IO_EXT) && extblocks > 0) { if (length != 0) panic("ffs_truncate: partial trunc of extdata"); if (softdeptrunc || journaltrunc) { if ((flags & IO_NORMAL) == 0) goto extclean; needextclean = 1; } else { if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); #ifdef QUOTA (void) chkdq(ip, -extblocks, NOCRED, 0); #endif vinvalbuf(vp, V_ALT, 0, 0); vn_pages_remove(vp, OFF_TO_IDX(lblktosize(fs, -extblocks)), 0); osize = ip->i_din2->di_extsize; ip->i_din2->di_blocks -= extblocks; ip->i_din2->di_extsize = 0; - for (i = 0; i < NXADDR; i++) { + for (i = 0; i < UFS_NXADDR; i++) { oldblks[i] = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; } ip->i_flag |= IN_CHANGE; if ((error = ffs_update(vp, waitforupdate))) return (error); - for (i = 0; i < NXADDR; i++) { + for (i = 0; i < UFS_NXADDR; i++) { if (oldblks[i] == 0) continue; ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i], sblksize(fs, osize, i), ip->i_number, vp->v_type, NULL); } } } if ((flags & IO_NORMAL) == 0) return (0); if (vp->v_type == VLNK && (ip->i_size < vp->v_mount->mnt_maxsymlinklen || datablocks == 0)) { #ifdef INVARIANTS if (length != 0) panic("ffs_truncate: partial truncate of symlink"); #endif bzero(SHORTLINK(ip), (u_int)ip->i_size); ip->i_size = 0; DIP_SET(ip, i_size, 0); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) goto extclean; return (ffs_update(vp, waitforupdate)); } if (ip->i_size == length) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (needextclean) goto extclean; return (ffs_update(vp, 0)); } if (fs->fs_ronly) panic("ffs_truncate: read-only filesystem"); if (IS_SNAPSHOT(ip)) ffs_snapremove(vp); vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; osize = ip->i_size; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { vnode_pager_setsize(vp, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) { vnode_pager_setsize(vp, osize); return (error); } ip->i_size = length; DIP_SET(ip, i_size, length); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(vp)) bdwrite(bp); else bawrite(bp); ip->i_flag |= IN_CHANGE | IN_UPDATE; return (ffs_update(vp, waitforupdate)); } /* * Lookup block number for a given offset. Zero length files * have no blocks, so return a blkno of -1. */ lbn = lblkno(fs, length - 1); if (length == 0) { blkno = -1; - } else if (lbn < NDADDR) { + } else if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, cred, BA_METAONLY, &bp); if (error) return (error); - indiroff = (lbn - NDADDR) % NINDIR(fs); + indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; else blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; /* * If the block number is non-zero, then the indirect block * must have been previously allocated and need not be written. 
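
For the offset computation above: once a file extends past the direct pointers, the slot within the last-level indirect block returned by BA_METAONLY is (lbn - UFS_NDADDR) % NINDIR(fs). A worked instance with illustrative sizes:

#include <stdio.h>

int main(void)
{
        long lbn = 10000;
        long indiroff = (lbn - 12) % 4096;      /* slot within indirect */

        printf("lbn %ld -> indirect slot %ld\n", lbn, indiroff); /* 1796 */
        return (0);
}
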
* If the block number is zero, then we may have allocated * the indirect block and hence need to write it out. */ if (blkno != 0) brelse(bp); else if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } /* * If the block number at the new end of the file is zero, * then we must allocate it to ensure that the last block of * the file is allocated. Soft updates does not handle this * case, so here we have to clean up the soft updates data * structures describing the allocation past the truncation * point. Finding and deallocating those structures is a lot of * work. Since partial truncation with a hole at the end occurs * rarely, we solve the problem by syncing the file so that it * will have no soft updates data structures left. */ if (blkno == 0 && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); if (blkno != 0 && DOINGSOFTDEP(vp)) { if (softdeptrunc == 0 && journaltrunc == 0) { /* * If soft updates cannot handle this truncation, * clean up soft dependency data structures and * fall through to the synchronous truncation. */ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); } else { flags = IO_NORMAL | (needextclean ? IO_EXT: 0); if (journaltrunc) softdep_journal_freeblocks(ip, cred, length, flags); else softdep_setup_freeblocks(ip, length, flags); ASSERT_VOP_LOCKED(vp, "ffs_truncate1"); if (journaltrunc == 0) { ip->i_flag |= IN_CHANGE | IN_UPDATE; error = ffs_update(vp, 0); } return (error); } } /* * Shorten the size of the file. If the last block of the * shortened file is unallocated, we must allocate it. * Additionally, if the file is not being truncated to a * block boundary, the contents of the partial block * following the end of the file must be zero'ed in * case it ever becomes accessible again because of * subsequent file growth. Directories however are not * zero'ed as they should grow back initialized to empty. */ offset = blkoff(fs, length); if (blkno != 0 && offset == 0) { ip->i_size = length; DIP_SET(ip, i_size, length); } else { lbn = lblkno(fs, length); flags |= BA_CLRBUF; error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) return (error); /* * When we are doing soft updates and the UFS_BALLOC * above fills in a direct block hole with a full sized * block that will be truncated down to a fragment below, * we must flush out the block dependency with an FSYNC * so that we do not get a soft updates inconsistency * when we create the fragment below. */ - if (DOINGSOFTDEP(vp) && lbn < NDADDR && + if (DOINGSOFTDEP(vp) && lbn < UFS_NDADDR && fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); ip->i_size = length; DIP_SET(ip, i_size, length); size = blksize(fs, ip, lbn); if (vp->v_type != VDIR && offset != 0) bzero((char *)bp->b_data + offset, (u_int)(size - offset)); /* Kirk's code has reallocbuf(bp, size, 1) here */ allocbuf(bp, size); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; if (flags & IO_SYNC) bwrite(bp); else if (DOINGASYNC(vp)) bdwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. 
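
The lastblock/lastiblock indexes just described can be tried with concrete numbers; 12, 4096 and 32K below are assumptions standing in for UFS_NDADDR, NINDIR(fs) and fs_bsize:

#include <stdio.h>

int main(void)
{
        long nindir = 4096, bsize = 32768, length = 1L << 30;
        long lastblock = (length + bsize - 1) / bsize - 1;
        long single = lastblock - 12;
        long dbl = single - nindir;
        long triple = dbl - nindir * nindir;

        printf("last %ld single %ld double %ld triple %ld\n",
            lastblock, single, dbl, triple);
        return (0);
}

A negative lastiblock entry means that whole level of indirection is gone, which is what the normalization to -1 below relies on.
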
*/ lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; - lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[SINGLE] = lastblock - UFS_NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ffs_indirtrunc below. */ for (level = TRIPLE; level >= SINGLE; level--) { - oldblks[NDADDR + level] = DIP(ip, i_ib[level]); + oldblks[UFS_NDADDR + level] = DIP(ip, i_ib[level]); if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); lastiblock[level] = -1; } } - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < UFS_NDADDR; i++) { oldblks[i] = DIP(ip, i_db[i]); if (i > lastblock) DIP_SET(ip, i_db[i], 0); } ip->i_flag |= IN_CHANGE | IN_UPDATE; allerror = ffs_update(vp, waitforupdate); /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ - for (i = 0; i < NDADDR; i++) { + for (i = 0; i < UFS_NDADDR; i++) { newblks[i] = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], oldblks[i]); } - for (i = 0; i < NIADDR; i++) { - newblks[NDADDR + i] = DIP(ip, i_ib[i]); - DIP_SET(ip, i_ib[i], oldblks[NDADDR + i]); + for (i = 0; i < UFS_NIADDR; i++) { + newblks[UFS_NDADDR + i] = DIP(ip, i_ib[i]); + DIP_SET(ip, i_ib[i], oldblks[UFS_NDADDR + i]); } ip->i_size = osize; DIP_SET(ip, i_size, osize); error = vtruncbuf(vp, cred, length, fs->fs_bsize); if (error && (allerror == 0)) allerror = error; /* * Indirect blocks first. */ - indir_lbn[SINGLE] = -NDADDR; + indir_lbn[SINGLE] = -UFS_NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = DIP(ip, i_ib[level]); if (bn != 0) { error = ffs_indirtrunc(ip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { DIP_SET(ip, i_ib[level], 0); ffs_blkfree(ump, fs, ump->um_devvp, bn, fs->fs_bsize, ip->i_number, vp->v_type, NULL); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ - for (i = NDADDR - 1; i > lastblock; i--) { + for (i = UFS_NDADDR - 1; i > lastblock; i--) { long bsize; bn = DIP(ip, i_db[i]); if (bn == 0) continue; DIP_SET(ip, i_db[i], 0); bsize = blksize(fs, ip, i); ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number, vp->v_type, NULL); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = DIP(ip, i_db[lastblock]); if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, ip, lastblock); ip->i_size = length; DIP_SET(ip, i_size, length); newspace = blksize(fs, ip, lastblock); if (newspace == 0) panic("ffs_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. 
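
Worked numbers for the fragment release described in the comment above, assuming 4K fragments and 32K blocks; numfrags() reduces a byte count to fragments, so the freed run starts just past the storage being kept:

#include <stdio.h>

int main(void)
{
        long fsize = 4096, bn = 800;    /* frag addr of the last block */
        long oldspace = 32768, newspace = 12288;

        if (oldspace - newspace > 0)
                printf("free %ld frags at %ld\n",
                    (oldspace - newspace) / fsize, bn + newspace / fsize);
        return (0);
}
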
*/ bn += numfrags(fs, newspace); ffs_blkfree(ump, fs, ump->um_devvp, bn, oldspace - newspace, ip->i_number, vp->v_type, NULL); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef INVARIANTS for (level = SINGLE; level <= TRIPLE; level++) - if (newblks[NDADDR + level] != DIP(ip, i_ib[level])) + if (newblks[UFS_NDADDR + level] != DIP(ip, i_ib[level])) panic("ffs_truncate1"); - for (i = 0; i < NDADDR; i++) + for (i = 0; i < UFS_NDADDR; i++) if (newblks[i] != DIP(ip, i_db[i])) panic("ffs_truncate2"); BO_LOCK(bo); if (length == 0 && (fs->fs_magic != FS_UFS2_MAGIC || ip->i_din2->di_extsize == 0) && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("ffs_truncate3"); BO_UNLOCK(bo); #endif /* INVARIANTS */ /* * Put back the real size. */ ip->i_size = length; DIP_SET(ip, i_size, length); if (DIP(ip, i_blocks) >= blocksreleased) DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - blocksreleased); else /* sanity */ DIP_SET(ip, i_blocks, 0); ip->i_flag |= IN_CHANGE; #ifdef QUOTA (void) chkdq(ip, -blocksreleased, NOCRED, 0); #endif return (allerror); extclean: if (journaltrunc) softdep_journal_freeblocks(ip, cred, length, IO_EXT); else softdep_setup_freeblocks(ip, length, IO_EXT); return (ffs_update(vp, waitforupdate)); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ static int ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) struct inode *ip; ufs2_daddr_t lbn, lastbn; ufs2_daddr_t dbn; int level; ufs2_daddr_t *countp; { struct buf *bp; struct fs *fs; struct vnode *vp; caddr_t copy = NULL; int i, nblocks, error = 0, allerror = 0; ufs2_daddr_t nb, nlbn, last; ufs2_daddr_t blkcount, factor, blocksreleased = 0; ufs1_daddr_t *bap1 = NULL; ufs2_daddr_t *bap2 = NULL; #define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i]) fs = ITOFS(ip); /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = lbn_offset(fs, level); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. 
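
At each recursion level, ffs_indirtrunc() converts the last data block to keep into the last pointer slot to keep by dividing by "factor", the number of data blocks one slot spans at that level. A worked instance for a double-indirect block, with 4096 pointers per block assumed:

#include <stdio.h>

int main(void)
{
        long nindir = 4096;
        long lastbn = 28659;            /* last data block to keep */
        long factor = nindir;           /* DOUBLE: one slot per nindir */
        long last = lastbn > 0 ? lastbn / factor : lastbn;

        printf("keep slots 0..%ld, free slots %ld..%ld\n",
            last, last + 1, nindir - 1);
        return (0);
}

Slot "last" is kept but only partially, which is why the code recurses into it with lastbn % factor.
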
*/ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; /* pay for read */ bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_bcount > bp->b_bufsize) panic("ffs_indirtrunc: bad buffer size"); bp->b_blkno = dbn; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); error = bufwait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)bp->b_data; else bap2 = (ufs2_daddr_t *)bp->b_data; if (lastbn != -1) { copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bp->b_data, copy, (u_int)fs->fs_bsize); for (i = last + 1; i < NINDIR(fs); i++) if (I_IS_UFS1(ip)) bap1[i] = 0; else bap2[i] = 0; if (DOINGASYNC(vp)) { bdwrite(bp); } else { error = bwrite(bp); if (error) allerror = error; } if (I_IS_UFS1(ip)) bap1 = (ufs1_daddr_t *)copy; else bap2 = (ufs2_daddr_t *)copy; } /* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = BAP(ip, i); if (nb == 0) continue; if (level > SINGLE) { if ((error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (ufs2_daddr_t)-1, level - 1, &blkcount)) != 0) allerror = error; blocksreleased += blkcount; } ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize, ip->i_number, vp->v_type, NULL); blocksreleased += nblocks; } /* * Recursively free last partial block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = BAP(ip, i); if (nb != 0) { error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount); if (error) allerror = error; blocksreleased += blkcount; } } if (copy != NULL) { free(copy, M_TEMP); } else { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } *countp = blocksreleased; return (allerror); } int ffs_rdonly(struct inode *ip) { return (ITOFS(ip)->fs_ronly != 0); } Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 313779) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 313780) @@ -1,2681 +1,2681 @@ /*- * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #define DEBUG 1 #include "opt_ffs.h" #ifdef NO_FFS_SNAPSHOT int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { return (EINVAL); } int ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *wkhd; { return (EINVAL); } void ffs_snapremove(vp) struct vnode *vp; { } void ffs_snapshot_mount(mp) struct mount *mp; { } void ffs_snapshot_unmount(mp) struct mount *mp; { } void ffs_snapgone(ip) struct inode *ip; { } int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { return (EINVAL); } void ffs_sync_snap(mp, waitfor) struct mount *mp; int waitfor; { } #else FEATURE(ffs_snapshot, "FFS snapshot support"); LIST_HEAD(, snapdata) snapfree; static struct mtx snapfree_lock; MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); static void try_free_snapdata(struct vnode *devvp); static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); static int ffs_bp_snapblk(struct vnode *, struct buf *); /* * To ensure the consistency of snapshots across crashes, we must * synchronously 
write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the code normally only ensures * persistence for the filesystem metadata contained within a * snapshot. Setting the following flag allows this crash * persistence to be enabled for file contents. */ int dopersistence = 0; #ifdef DEBUG #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; ufs2_daddr_t blockno; uint64_t flag; struct timespec starttime = {0, 0}, endtime; char saved_nice = 0; long redo = 0, snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp; struct nameidata nd; struct mount *wrtmp; struct vattr vat; struct vnode *vp, *xvp, *mvp, *devvp; struct uio auio; struct iovec aiov; struct snapdata *sn; struct ufsmount *ump; ump = VFSTOUFS(mp); fs = ump->um_fs; sn = NULL; /* * At the moment, journaled soft updates cannot support * taking snapshots. */ if (MOUNTEDSUJ(mp)) { vfs_mount_error(mp, "%s: Snapshots are not yet supported when " "running with journaled soft updates", fs->fs_fsmnt); return (EOPNOTSUPP); } MNT_ILOCK(mp); flag = mp->mnt_flag; MNT_IUNLOCK(mp); /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. */ UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; UFS_UNLOCK(ump); if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE, snapfile, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); vfs_rel(wrtmp); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); VOP_UNLOCK(nd.ni_dvp, 0); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); vrele(nd.ni_dvp); return (error); } vp = nd.ni_vp; vp->v_vflag |= VV_SYSTEM; ip = VTOI(vp); devvp = ITODEVVP(ip); /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. 
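
The preallocation loop that follows strides by NINDIR(fs): touching one logical block per indirect block's worth of file space is enough to force every indirect block into existence before the filesystem is suspended. A sketch of the stride with assumed geometry (12 direct pointers, 4096 entries per indirect):

#include <stdio.h>

int main(void)
{
        long numblks = 1000000, n = 0;

        for (long blkno = 12; blkno < numblks; blkno += 4096)
                n++;                    /* one BA_METAONLY-style touch */
        printf("%ld indirect blocks forced in\n", n);
        return (0);
}
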
 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	DIP_SET(ip, i_size, ip->i_size);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	error = readblock(vp, bp, numblks - 1);
	bawrite(bp);
	if (error != 0)
		goto out;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
-	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
+	for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (cg % 10 == 0)
			ffs_syncvnode(vp, MNT_WAIT, 0);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
	UFS_LOCK(ump);
	fs->fs_active = space;
	UFS_UNLOCK(ump);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (cg % 10 == 0)
			ffs_syncvnode(vp, MNT_WAIT, 0);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_proc->p_nice > 0) {
		struct proc *p;

		p = td->td_proc;
		PROC_LOCK(p);
		saved_nice = p->p_nice;
		sched_nice(p, 0);
		PROC_UNLOCK(p);
	}
	/*
	 * Suspend operation on filesystem.
	 */
*/ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (ip->i_effnlink == 0) { error = ENOENT; /* Snapshot file unlinked */ goto out1; } if (collectsnapstats) nanotime(&starttime); /* The last block might have changed. Copy it again to be sure. */ error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error != 0) goto out1; error = readblock(vp, bp, numblks - 1); bp->b_flags |= B_VALIDSUSPWRT; bawrite(bp); if (error != 0) goto out1; /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; redo++; error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto out1; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK); bcopy(fs, copy_fs, fs->fs_sbsize); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&((char *)copy_fs)[fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; goto out1; } bcopy(bp->b_data, space, (u_int)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the needed size for the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; MNT_IUNLOCK(mp); loop: MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) { if ((xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || xvp->v_type == VNON || IS_SNAPSHOT(VTOI(xvp))) { VI_UNLOCK(xvp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. 
*/ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); continue; } vholdl(xvp); if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); vdrop(xvp); goto loop; } VI_LOCK(xvp); if (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { VI_UNLOCK(xvp); VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } VI_UNLOCK(xvp); if (snapdebug) vn_printf(xvp, "ffs_snapshot: busy vnode "); if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; - if (loc < NDADDR) { + if (loc < UFS_NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number, xvp->v_type, NULL); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 1); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 1); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); VOP_UNLOCK(xvp, 0); vdrop(xvp); if (error) { free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto out1; } } /* * Erase the journal file from the snapshot. */ if (fs->fs_flags & FS_SUJ) { error = softdep_journal_lookup(mp, &xvp); if (error) { free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; goto out1; } xp = VTOI(xvp); if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 0); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 0); vput(xvp); } /* * Acquire a lock on the snapdata structure, creating it if necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the original * private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); xp = TAILQ_FIRST(&sn->sn_head); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg)) > blkno) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (sn->sn_blklist != NULL) panic("ffs_snapshot: non-empty list"); sn->sn_blklist = snapblklist; sn->sn_listsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list.
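 * The sn_head TAILQ is therefore ordered from oldest to newest
 * snapshot, and the superblock slot reserved at the start of
 * ffs_snapshot() now records this snapshot's inode number.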
*/ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %ju already on list", (uintmax_t)ip->i_number); TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); out1: KASSERT((sn != NULL && copy_fs != NULL && error == 0) || (sn == NULL && copy_fs == NULL && error != 0), ("email phk@ and mckusick@")); /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR); if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); timespecsub(&endtime, &starttime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } if (copy_fs == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP, 0); if (error == 0 && xp->i_effnlink == 0) { error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); } if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. */ if (I_IS_UFS1(ip)) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP, 0); if (error) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset = ip->i_size; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); } else { loc = blkoff(fs, fs->fs_sblockloc); bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize); bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. 
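 * The list layout is: snapblklist[0] holds the number of entries and
 * snapblklist[1..n] hold the preallocated logical block numbers in
 * ascending order.  Keeping the entries sorted is what allows
 * ffs_copyonwrite() and ffs_bp_snapblk() below to binary search
 * entries 1 through sn_listsize - 1.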
*/ VI_LOCK(devvp); space = sn->sn_blklist; sn->sn_blklist = snapblklist; sn->sn_listsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) free(space, M_UFSMNT); /* * Preallocate all the direct blocks in the snapshot inode so * that we never have to write the inode itself to commit an * update to the contents of the snapshot. Note that once * created, the size of the snapshot will never change, so * there will never be a need to write the inode except to * update the non-integrity-critical time fields and * allocated-block count. */ - for (blockno = 0; blockno < NDADDR; blockno++) { + for (blockno = 0; blockno < UFS_NDADDR; blockno++) { if (DIP(ip, i_db[blockno]) != 0) continue; error = UFS_BALLOC(vp, lblktosize(fs, blockno), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) break; error = readblock(vp, bp, blockno); bawrite(bp); if (error != 0) break; } done: free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; out: NDFREE(&nd, NDF_ONLY_PNBUF); if (saved_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); sched_nice(td->td_proc, saved_nice); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); if (fs->fs_active != 0) { free(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } UFS_UNLOCK(ump); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); MNT_IUNLOCK(mp); if (error) (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); (void) ffs_syncvnode(vp, MNT_WAIT, 0); if (error) vput(vp); else VOP_UNLOCK(vp, 0); vrele(nd.ni_dvp); vn_finished_write(wrtmp); process_deferred_inactive(mp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. */ static int cgaccount(cg, vp, nbp, passno) int cg; struct vnode *vp; struct buf *nbp; int passno; { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ITOFS(ip); error = bread(ITODEVVP(ip), fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (EIO); } UFS_LOCK(ITOUMP(ip)); ACTIVESET(fs, cg); /* * Recomputation of summary information might not have been performed * at mount time. Sync up summary information for current cylinder * group while data is in memory to ensure that result of background * fsck is slightly more consistent. 
*/ fs->fs_cs(fs, cg) = cgp->cg_cs; UFS_UNLOCK(ITOUMP(ip)); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; - if (base < NDADDR) { - for ( ; loc < NDADDR; loc++) { + if (base < UFS_NDADDR) { + for ( ; loc < UFS_NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } - indiroff = (base + loc - NDADDR) % NINDIR(fs); + indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = 0; } if (I_IS_UFS1(ip)) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; int clearmode; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. 
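 * The logical block holding the on-disk inode is located with
 * ino_to_fsba(); if it falls in the direct range its pointer is read
 * from di_db[], otherwise the covering indirect block is obtained
 * with BA_METAONLY and the pointer is fetched from it.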
*/ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); - indiroff = (lbn - NDADDR) % NINDIR(fs); + indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * or unlinked snapshots to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; - bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); + bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], - &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) + &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], - &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) + &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; - lbn = -NDADDR; - len = numblks - NDADDR; - rlbn = NDADDR; - for (i = 0; len > 0 && i < NIADDR; i++) { + lbn = -UFS_NDADDR; + len = numblks - UFS_NDADDR; + rlbn = UFS_NDADDR; + for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs1_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; - struct indir indirs[NIADDR + 2]; + struct indir indirs[UFS_NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. 
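 * That is, bread() is open-coded below: getblk() obtains the buffer,
 * b_blkno is filled in from the block number we already hold, and
 * readblock() is issued only if the buffer is not already valid
 * (B_DONE or B_DELWRI), avoiding the bmap lookup that a normal
 * bread() would trigger.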
*/ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { blkp = &ip->i_din1->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) - [(lbn - NDADDR) % NINDIR(fs)]; + [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { - if (lbn >= NDADDR) + if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; - if (lbn >= NDADDR) + if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. 
*/ static int mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; int clearmode; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); - indiroff = (lbn - NDADDR) % NINDIR(fs); + indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; - bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); + bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. 
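 * The direct blocks and the indirect-pointer table in the dinode are
 * accounted first; each indirect chain is then walked by
 * indiracct_ufs2() with blksperindir growing by a factor of
 * NINDIR(fs) per level.  As an illustrative example (figures assumed,
 * not from the original source): with 8K blocks and 8-byte UFS2 block
 * pointers, NINDIR(fs) == 1024, so the single indirect covers logical
 * blocks UFS_NDADDR through UFS_NDADDR + 1023 and the double indirect
 * the following 1024 * 1024 blocks.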
*/ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], - &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) + &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], - &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) + &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; - lbn = -NDADDR; - len = numblks - NDADDR; - rlbn = NDADDR; - for (i = 0; len > 0 && i < NIADDR; i++) { + lbn = -UFS_NDADDR; + len = numblks - UFS_NDADDR; + rlbn = UFS_NDADDR; + for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs2_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; - struct indir indirs[NIADDR + 2]; + struct indir indirs[UFS_NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. 
*/ static int fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { blkp = &ip->i_din2->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) - [(lbn - NDADDR) % NINDIR(fs)]; + [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { - if (lbn >= NDADDR) + if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; - if (lbn >= NDADDR) + if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ITODEVVP(ip)->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %ju\n", (uintmax_t)ip->i_number); /* * Delete snapshot inode from superblock. Keep list dense. */ ump = ITOUMP(ip); fs = ump->um_fs; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. 
*/ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip; struct vnode *devvp; struct buf *ibp; struct fs *fs; ufs2_daddr_t numblks, blkno, dblk; int error, loc, last; struct snapdata *sn; ip = VTOI(vp); fs = ITOFS(ip); devvp = ITODEVVP(ip); /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) { sn = devvp->v_rdev->si_snapdata; TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; VI_UNLOCK(devvp); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapremove: lost lock mutation")); vp->v_vnlock = &vp->v_lock; VI_LOCK(devvp); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); try_free_snapdata(devvp); } else VI_UNLOCK(devvp); /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). */ - for (blkno = 1; blkno < NDADDR; blkno++) { + for (blkno = 1; blkno < UFS_NDADDR; blkno++) { dblk = DIP(ip, i_db[blkno]); if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); - for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { + for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > NINDIR(fs)) last = NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { if (I_IS_UFS1(ip)) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; } continue; } dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * The dirtied indirects must be written out before * softdep_setup_freeblocks() is called. Otherwise indir_trunc() * may find indirect pointers using the magic BLK_* values. */ if (DOINGSOFTDEP(vp)) ffs_syncvnode(vp, MNT_WAIT, 0); #ifdef QUOTA /* * Reenable disk quotas for ex-snapshot file. */ if (!getinoquota(ip)) (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); #endif } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. 
It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assured that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because it * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *wkhd; { struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * Lookup block being written. */ - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; - indiroff = (lbn - NDADDR) % NINDIR(fs); + indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ - if (lbn >= NDADDR) + if (lbn >= UFS_NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block.
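 * When a full-sized block is grabbed below, only its block pointer is
 * recorded (in i_db[] or in the indirect block entry) and i_blocks is
 * credited; no data is copied, since the on-disk contents are already
 * exactly what the snapshot should preserve.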
*/ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %ju lbn %jd from inum %ju\n", "Grabonremove: snapino", (uintmax_t)ip->i_number, (intmax_t)lbn, (uintmax_t)inum); #endif /* * If journaling is tracking this write we must add * the work to the inode or indirect being written. */ if (wkhd != NULL) { - if (lbn < NDADDR) + if (lbn < UFS_NDADDR) softdep_inode_append(ip, curthread->td_ucred, wkhd); else softdep_buf_append(ibp, wkhd); } - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; lockmgr(vp->v_vnlock, LK_RELEASE, NULL); return (1); } - if (lbn >= NDADDR) + if (lbn >= UFS_NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n", "Copyonremove: snapino ", (uintmax_t)ip->i_number, (intmax_t)lbn, "for inum", (uintmax_t)inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((vtype == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ if (error != 0 && wkhd != NULL) softdep_freework(wkhd); lockmgr(vp->v_vnlock, LK_RELEASE, NULL); return (error); } /* * Associate snapshot files when mounting. 
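 * The preallocated block-hint list is read back from the end of the
 * newest snapshot file: a daddr_t entry count stored at the first
 * offset past the filesystem-sized portion of the file, followed by
 * the block numbers themselves, exactly as ffs_snapshot() wrote them.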
*/ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct vnode *lastvp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; lastvp = NULL; sn = NULL; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if (!IS_SNAPSHOT(ip) || ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { if (!IS_SNAPSHOT(ip)) { reason = "non-snapshot"; } else { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); (void)ffs_syncvnode(vp, MNT_WAIT, 0); } printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * Acquire a lock on the snapdata structure, creating it if * necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the * original private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); /* * Link it onto the active snapshot list. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot_mount: %ju already on list", (uintmax_t)ip->i_number); else TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VI_UNLOCK(devvp); VOP_UNLOCK(vp, 0); lastvp = vp; } vp = lastvp; /* * No usable snapshots found. */ if (sn == NULL || vp == NULL) return; /* * Allocate the space for the block hints list. We always want to * use the list from the newest snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&snaplistsize; aiov.iov_len = sizeof(snaplistsize); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); VOP_UNLOCK(vp, 0); return; } snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); auio.uio_iovcnt = 1; aiov.iov_base = snapblklist; aiov.iov_len = snaplistsize * sizeof (daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset -= sizeof(snaplistsize); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); VOP_UNLOCK(vp, 0); free(snapblklist, M_UFSMNT); return; } VOP_UNLOCK(vp, 0); VI_LOCK(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); sn->sn_listsize = snaplistsize; sn->sn_blklist = (daddr_t *)snapblklist; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); } /* * Disassociate snapshot files when unmounting. 
*/ void ffs_snapshot_unmount(mp) struct mount *mp; { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct snapdata *sn; struct inode *xp; struct vnode *vp; VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { vp = ITOV(xp); TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); xp->i_nextsnap.tqe_prev = 0; lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(devvp)); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapshot_unmount: lost lock mutation")); vp->v_vnlock = &vp->v_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (xp->i_effnlink > 0) vrele(vp); VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; } try_free_snapdata(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); } /* * Check whether the buffer block belongs to a device buffer that shall be * locked after snaplk. devvp shall be locked on entry, and will be * left locked upon exit. */ static int ffs_bp_snapblk(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct fs *fs; ufs2_daddr_t lbn, *snapblklist; int lower, upper, mid; ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) return (0); fs = ITOFS(TAILQ_FIRST(&sn->sn_head)); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) return (1); return (0); } void ffs_bdflush(bo, bp) struct bufobj *bo; struct buf *bp; { struct thread *td; struct vnode *vp, *devvp; struct buf *nbp; int bp_bdskip; if (bo->bo_dirty.bv_cnt <= dirtybufthresh) return; td = curthread; vp = bp->b_vp; devvp = bo2vnode(bo); KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); VI_LOCK(devvp); bp_bdskip = ffs_bp_snapblk(devvp, bp); if (bp_bdskip) bdwriteskip++; VI_UNLOCK(devvp); if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { (void) VOP_FSYNC(vp, MNT_NOWAIT, td); altbufferflushes++; } else { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* * Don't call buf_countdeps() with the bo lock * held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (bp_bdskip) { VI_LOCK(devvp); if (!ffs_bp_snapblk(vp, nbp)) { VI_UNLOCK(devvp); BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } VI_UNLOCK(devvp); } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Check for need to copy block that is about to be written, * copying the block if necessary.
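 * The fast path below binary searches the preallocated sn_blklist
 * under the devvp interlock (the same search ffs_bp_snapblk() above
 * performs); the snapshot lock is taken and the per-snapshot block
 * maps are consulted only when the block is not found in that list.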
*/ int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = NULL; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; long saved_runningbufspace; if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp))) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ITOFS(ip); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ saved_runningbufspace = bp->b_runningbufspace; if (saved_runningbufspace != 0) runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. */ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; - indiroff = (lbn - NDADDR) % NINDIR(fs); + indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef INVARIANTS if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. 
* * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %ju lbn %jd for ", (uintmax_t)ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %ju", (uintmax_t)VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } return (error); } /* * sync snapshots to force freework records waiting on snapshots to claim * blocks to free. */ void ffs_sync_snap(mp, waitfor) struct mount *mp; int waitfor; { struct snapdata *sn; struct vnode *devvp; struct vnode *vp; struct inode *ip; devvp = VFSTOUFS(mp)->um_devvp; if ((devvp->v_vflag & VV_COPYONWRITE) == 0) return; for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return; } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) == 0) break; } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); ffs_syncvnode(vp, waitfor, NO_INO_UPDT); } lockmgr(&sn->sn_lock, LK_RELEASE, NULL); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). 
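 * Rather than going through the buffer cache, readblock() maps the
 * logical block number to its identical physical location with
 * blkstofrags() and issues a raw GEOM BIO_READ against the device,
 * so it returns the current on-disk contents of that filesystem
 * block.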
*/ static int readblock(vp, bp, lbn) struct vnode *vp; struct buf *bp; ufs2_daddr_t lbn; { struct inode *ip = VTOI(vp); struct bio *bip; struct fs *fs; ip = VTOI(vp); fs = ITOFS(ip); bip = g_alloc_bio(); bip->bio_cmd = BIO_READ; bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn))); bip->bio_data = bp->b_data; bip->bio_length = bp->b_bcount; bip->bio_done = NULL; g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private); bp->b_error = biowait(bip, "snaprdb"); g_destroy_bio(bip); return (bp->b_error); } #endif /* * Process file deletes that were deferred by ufs_inactive() due to * the file system being suspended. Transfer IN_LAZYACCESS into * IN_MODIFIED for vnodes that were accessed during suspension. */ void process_deferred_inactive(struct mount *mp) { struct vnode *vp, *mvp; struct inode *ip; struct thread *td; int error; td = curthread; (void) vn_start_secondary_write(NULL, &mp, V_WAIT); loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * IN_LAZYACCESS is checked here without holding any * vnode lock, but this flag is set only while holding * vnode interlock. */ if (vp->v_type == VNON || ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { VI_UNLOCK(vp); continue; } vholdl(vp); error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); if (error != 0) { vdrop(vp); if (error == ENOENT) continue; /* vnode recycled */ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } ip = VTOI(vp); if ((ip->i_flag & IN_LAZYACCESS) != 0) { ip->i_flag &= ~IN_LAZYACCESS; ip->i_flag |= IN_MODIFIED; } VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); vdrop(vp); continue; } vinactive(vp, td); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("process_deferred_inactive: got VI_OWEINACT")); VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } vn_finished_secondary_write(mp); } #ifndef NO_FFS_SNAPSHOT static struct snapdata * ffs_snapdata_alloc(void) { struct snapdata *sn; /* * Fetch a snapdata from the free list if there is one available. */ mtx_lock(&snapfree_lock); sn = LIST_FIRST(&snapfree); if (sn != NULL) LIST_REMOVE(sn, sn_link); mtx_unlock(&snapfree_lock); if (sn != NULL) return (sn); /* * If there were no free snapdatas allocate one. */ sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); return (sn); } /* * The snapdata is never freed because we can not be certain that * there are no threads sleeping on the snap lock. Persisting * them permanently avoids costly synchronization in ffs_lock(). 
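 * Accordingly, ffs_snapdata_free() below simply places the structure
 * on the global snapfree list under snapfree_lock, from which
 * ffs_snapdata_alloc() may later recycle it.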
static void ffs_snapdata_free(struct snapdata *sn) { mtx_lock(&snapfree_lock); LIST_INSERT_HEAD(&snapfree, sn, sn_link); mtx_unlock(&snapfree_lock); } /* Try to free snapdata associated with devvp */ static void try_free_snapdata(struct vnode *devvp) { struct snapdata *sn; ufs2_daddr_t *snapblklist; ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || (devvp->v_vflag & VV_COPYONWRITE) == 0) { VI_UNLOCK(devvp); return; } devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); snapblklist = sn->sn_blklist; sn->sn_blklist = NULL; sn->sn_listsize = 0; lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (snapblklist != NULL) free(snapblklist, M_UFSMNT); ffs_snapdata_free(sn); } static struct snapdata * ffs_snapdata_acquire(struct vnode *devvp) { struct snapdata *nsn; struct snapdata *sn; /* * Allocate a free snapdata. This is done before acquiring the * devvp lock to avoid allocation while the devvp interlock is * held. */ nsn = ffs_snapdata_alloc(); /* * If snapshots already exist on this filesystem, grab a * reference to the shared lock. Otherwise this is the first * snapshot on this filesystem and we need to use our * pre-allocated snapdata. */ VI_LOCK(devvp); if (devvp->v_rdev->si_snapdata == NULL) { devvp->v_rdev->si_snapdata = nsn; nsn = NULL; } sn = devvp->v_rdev->si_snapdata; /* * Acquire the snapshot lock. */ lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(devvp)); /* * Free any unused snapdata. */ if (nsn != NULL) ffs_snapdata_free(nsn); return (sn); } #endif Index: head/sys/ufs/ffs/ffs_softdep.c =================================================================== --- head/sys/ufs/ffs/ffs_softdep.c (revision 313779) +++ head/sys/ufs/ffs/ffs_softdep.c (revision 313780) @@ -1,14402 +1,14410 @@ /*- * Copyright 1998, 2000 Marshall Kirk McKusick. * Copyright 2009, 2010 Jeffrey W. Roberson * All rights reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * * Further information about soft updates can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include "opt_ffs.h" #include "opt_quota.h" #include "opt_ddb.h" /* * For now we want the safety net that the DEBUG flag provides. */ #ifndef DEBUG #define DEBUG #endif #include <sys/param.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/bio.h> #include <sys/buf.h> #include <sys/kdb.h> #include <sys/kthread.h> #include <sys/ktr.h> #include <sys/limits.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mount.h> #include <sys/mutex.h> #include <sys/namei.h> #include <sys/priv.h> #include <sys/proc.h> #include <sys/racct.h> #include <sys/rwlock.h> #include <sys/stat.h> #include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/vnode.h> #include <sys/conf.h> #include <ufs/ufs/dir.h> #include <ufs/ufs/extattr.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> #include <ufs/ufs/ufsmount.h> #include <ufs/ffs/fs.h> #include <ufs/ffs/softdep.h> #include <ufs/ffs/ffs_extern.h> #include <ufs/ufs/ufs_extern.h> #include <vm/vm.h> #include <vm/vm_extern.h> #include <vm/vm_object.h> #include <geom/geom.h> #include <ddb/ddb.h> #define KTR_SUJ 0 /* Define to KTR_SPARE. */ #ifndef SOFTUPDATES int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { panic("softdep_flushfiles called"); } int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { return (0); } void softdep_initialize() { return; } void softdep_uninitialize() { return; } void softdep_unmount(mp) struct mount *mp; { panic("softdep_unmount called"); } void softdep_setup_sbupdate(ump, fs, bp) struct ufsmount *ump; struct fs *fs; struct buf *bp; { panic("softdep_setup_sbupdate called"); } void softdep_setup_inomapdep(bp, ip, newinum, mode) struct buf *bp; struct inode *ip; ino_t newinum; int mode; { panic("softdep_setup_inomapdep called"); } void softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; int frags; int oldfrags; { panic("softdep_setup_blkmapdep called"); } void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocdirect called"); } void softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocext called"); } void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; ufs_lbn_t lbn; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; struct buf *nbp; { panic("softdep_setup_allocindir_page called"); } void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; struct inode *ip; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; { panic("softdep_setup_allocindir_meta called"); } void softdep_journal_freeblocks(ip, cred, length, flags) struct inode *ip; struct ucred *cred; off_t length; int flags; { panic("softdep_journal_freeblocks called"); } void softdep_journal_fsync(ip) struct inode *ip; { panic("softdep_journal_fsync called"); } void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; off_t length; int flags; { panic("softdep_setup_freeblocks called"); } void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { panic("softdep_freefile called"); } int
softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; struct inode *dp; off_t diroffset; ino_t newinum; struct buf *newdirbp; int isnewblk; { panic("softdep_setup_directory_add called"); } void softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) struct buf *bp; struct inode *dp; caddr_t base; caddr_t oldloc; caddr_t newloc; int entrysize; { panic("softdep_change_directoryentry_offset called"); } void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; int isrmdir; { panic("softdep_setup_remove called"); } void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; ino_t newinum; int isrmdir; { panic("softdep_setup_directory_change called"); } void softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) struct mount *mp; struct buf *bp; ufs2_daddr_t blkno; int frags; struct workhead *wkhd; { panic("%s called", __FUNCTION__); } void softdep_setup_inofree(mp, bp, ino, wkhd) struct mount *mp; struct buf *bp; ino_t ino; struct workhead *wkhd; { panic("%s called", __FUNCTION__); } void softdep_setup_unlink(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_rmdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_rmdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_create(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_create(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_mkdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_mkdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_dotdot_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } int softdep_prealloc(vp, waitok) struct vnode *vp; int waitok; { panic("%s called", __FUNCTION__); } int softdep_journal_lookup(mp, vpp) struct mount *mp; struct vnode **vpp; { return (ENOENT); } void softdep_change_linkcnt(ip) struct inode *ip; { panic("softdep_change_linkcnt called"); } void softdep_load_inodeblock(ip) struct inode *ip; { panic("softdep_load_inodeblock called"); } void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; struct buf *bp; int waitfor; { panic("softdep_update_inodeblock called"); } int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { return (0); } void softdep_fsync_mountdev(vp) struct vnode *vp; { return; } int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { *countp = 0; return (0); } int softdep_sync_metadata(struct vnode *vp) { panic("softdep_sync_metadata called"); } int softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) { panic("softdep_sync_buf called"); } int softdep_slowdown(vp) struct vnode *vp; { panic("softdep_slowdown called"); } int softdep_request_cleanup(fs, vp, cred, resource) struct fs *fs; struct vnode *vp; struct ucred *cred; int resource; { return (0); } int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_depcnt, int 
softdep_accdepcnt, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; int error; (void) softdep_depcnt, (void) softdep_accdepcnt; bo = &devvp->v_bufobj; ASSERT_BO_WLOCKED(bo); MNT_ILOCK(mp); while (mp->mnt_secondary_writes != 0) { BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); MNT_ILOCK(mp); } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; BO_UNLOCK(bo); return (error); } void softdep_get_depcounts(struct mount *mp, int *softdepactivep, int *softdepactiveaccp) { (void) mp; *softdepactivep = 0; *softdepactiveaccp = 0; } void softdep_buf_append(bp, wkhd) struct buf *bp; struct workhead *wkhd; { panic("softdep_buf_appendwork called"); } void softdep_inode_append(ip, cred, wkhd) struct inode *ip; struct ucred *cred; struct workhead *wkhd; { panic("softdep_inode_appendwork called"); } void softdep_freework(wkhd) struct workhead *wkhd; { panic("softdep_freework called"); } #else FEATURE(softupdates, "FFS soft-updates support"); static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, "total dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0, "high use dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, "current dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0, "current dependencies written"); unsigned long dep_current[D_LAST + 1]; unsigned long dep_highuse[D_LAST + 1]; unsigned long dep_total[D_LAST + 1]; unsigned long dep_write[D_LAST + 1]; #define SOFTDEP_TYPE(type, str, long) \ static MALLOC_DEFINE(M_ ## type, #str, long); \ SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ &dep_total[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ &dep_current[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, \ &dep_highuse[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ &dep_write[D_ ## type], 0, ""); SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, "Block or frag allocated from cyl group map"); SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); SOFTDEP_TYPE(JREMREF, jremref, "Journal 
inode ref remove"); SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel"); static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data"); #define M_SOFTDEP_FLAGS (M_WAITOK) /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX */ static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, M_BMSAFEMAP, M_NEWBLK, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, M_FREEFRAG, M_FREEBLKS, M_FREEFILE, M_DIRADD, M_MKDIR, M_DIRREM, M_NEWDIRBLK, M_FREEWORK, M_FREEDEP, M_JADDREF, M_JREMREF, M_JMVREF, M_JNEWBLK, M_JFREEBLK, M_JFREEFRAG, M_JSEG, M_JSEGDEP, M_SBDEP, M_JTRUNC, M_JFSYNC, M_SENTINEL }; #define DtoM(type) (memtype[type]) /* * Names of malloc types. */ #define TYPENAME(type) \ ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) /* * Internal function prototypes. */ static void check_clear_deps(struct mount *); static void softdep_error(char *, int); static int softdep_process_worklist(struct mount *, int); static int softdep_waitidle(struct mount *, int); static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct rwlock *, int); static int check_inodedep_free(struct inodedep *); static void clear_remove(struct mount *); static void clear_inodedeps(struct mount *); static void unlinked_inodedep(struct mount *, struct inodedep *); static void clear_unlinked_inodedep(struct inodedep *); static struct inodedep *first_unlinked_inodedep(struct ufsmount *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); static int free_pagedep(struct pagedep *); static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int sync_cgs(struct mount *, int); static int handle_written_filepage(struct pagedep *, struct buf *, int); static int handle_written_sbdep(struct sbdep *, struct buf *); static void initiate_write_sbdep(struct sbdep *); static void diradd_inode_written(struct diradd *, struct inodedep *); static int handle_written_indirdep(struct indirdep *, struct buf *, struct buf**, int); static int handle_written_inodeblock(struct inodedep *, struct buf *, int); static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, uint8_t *); static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int); static void handle_written_jaddref(struct jaddref *); static void handle_written_jremref(struct jremref *); static void handle_written_jseg(struct jseg *, struct buf *); static void handle_written_jnewblk(struct jnewblk *); static void handle_written_jblkdep(struct jblkdep *); static void 
handle_written_jfreefrag(struct jfreefrag *); static void complete_jseg(struct jseg *); static void complete_jsegs(struct jseg *); static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); static void jremref_write(struct jremref *, struct jseg *, uint8_t *); static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); static inline void inoref_write(struct inoref *, struct jseg *, struct jrefrec *); static void handle_allocdirect_partdone(struct allocdirect *, struct workhead *); static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, struct workhead *); static void indirdep_complete(struct indirdep *); static int indirblk_lookup(struct mount *, ufs2_daddr_t); static void indirblk_insert(struct freework *); static void indirblk_remove(struct freework *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); static void initiate_write_indirdep(struct indirdep*, struct buf *); static void handle_written_mkdir(struct mkdir *, int); static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, uint8_t *); static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static int handle_workitem_remove(struct dirrem *, int); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); static struct indirdep *indirdep_lookup(struct mount *, struct inode *, struct buf *); static void cancel_indirdep(struct indirdep *, struct buf *, struct freeblks *); static void free_indirdep(struct indirdep *); static void free_diradd(struct diradd *, struct workhead *); static void merge_diradd(struct inodedep *, struct diradd *); static void complete_diradd(struct diradd *); static struct diradd *diradd_lookup(struct pagedep *, int); static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, struct jremref *); static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, struct jremref *); static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, struct jremref *, struct jremref *); static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, struct jremref *); static void cancel_allocindir(struct allocindir *, struct buf *bp, struct freeblks *, int); static int setup_trunc_indir(struct freeblks *, struct inode *, ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); static void complete_trunc_indir(struct freework *); static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, int); static void complete_mkdir(struct mkdir *); static void free_newdirblk(struct newdirblk *); static void free_jremref(struct jremref *); static void free_jaddref(struct jaddref *); static void free_jsegdep(struct jsegdep *); static void free_jsegs(struct jblocks *); static void rele_jseg(struct jseg *); static void free_jseg(struct jseg *, struct jblocks 
*); static void free_jnewblk(struct jnewblk *); static void free_jblkdep(struct jblkdep *); static void free_jfreefrag(struct jfreefrag *); static void free_freedep(struct freedep *); static void journal_jremref(struct dirrem *, struct jremref *, struct inodedep *); static void cancel_jnewblk(struct jnewblk *, struct workhead *); static int cancel_jaddref(struct jaddref *, struct inodedep *, struct workhead *); static void cancel_jfreefrag(struct jfreefrag *); static inline void setup_freedirect(struct freeblks *, struct inode *, int, int); static inline void setup_freeext(struct freeblks *, struct inode *, int, int); static inline void setup_freeindir(struct freeblks *, struct inode *, int, ufs_lbn_t, int); static inline struct freeblks *newfreeblks(struct mount *, struct inode *); static void freeblks_free(struct ufsmount *, struct freeblks *, int); static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, int, int); static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); static int cancel_pagedep(struct pagedep *, struct freeblks *, int); static int deallocate_dependencies(struct buf *, struct freeblks *, int); static void newblk_freefrag(struct newblk*); static void free_newblk(struct newblk *); static void cancel_allocdirect(struct allocdirectlst *, struct allocdirect *, struct freeblks *); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); static void freework_freeblock(struct freework *); static void freework_enqueue(struct freework *); static int handle_workitem_freeblocks(struct freeblks *, int); static int handle_complete_freeblocks(struct freeblks *, int); static void handle_workitem_indirblk(struct freework *); static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, struct workhead *); static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, struct inodedep *, struct allocindir *, ufs_lbn_t); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, ufs_lbn_t); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); static struct freefrag *allocindir_merge(struct allocindir *, struct allocindir *); static int bmsafemap_find(struct bmsafemap_hashhead *, int, struct bmsafemap **); static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, int cg, struct bmsafemap *); static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int, struct newblk **); static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, int, struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static void schedule_cleanup(struct mount *); static void 
softdep_ast_cleanup_proc(void); static int process_worklist_item(struct mount *, int, int); static void process_removes(struct vnode *); static void process_truncates(struct vnode *); static void jwork_move(struct workhead *, struct workhead *); static void jwork_insert(struct workhead *, struct jsegdep *); static void add_to_worklist(struct worklist *, int); static void wake_worklist(struct worklist *); static void wait_worklist(struct worklist *, char *); static void remove_from_worklist(struct worklist *); static void softdep_flush(void *); static void softdep_flushjournal(struct mount *); static int softdep_speedup(struct ufsmount *); static void worklist_speedup(struct mount *); static int journal_mount(struct mount *, struct fs *, struct ucred *); static void journal_unmount(struct ufsmount *); static int journal_space(struct ufsmount *, int); static void journal_suspend(struct ufsmount *); static int journal_unsuspend(struct ufsmount *ump); static void softdep_prelink(struct vnode *, struct vnode *); static void add_to_journal(struct worklist *); static void remove_from_journal(struct worklist *); static bool softdep_excess_items(struct ufsmount *, int); static void softdep_process_journal(struct mount *, struct worklist *, int); static struct jremref *newjremref(struct dirrem *, struct inode *, struct inode *ip, off_t, nlink_t); static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, uint16_t); static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, uint16_t); static inline struct jsegdep *inoref_jseg(struct inoref *); static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, ufs2_daddr_t, int); static void adjust_newfreework(struct freeblks *, int); static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); static void move_newblock_dep(struct jaddref *, struct inodedep *); static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, ufs2_daddr_t, long, ufs_lbn_t); static struct freework *newfreework(struct ufsmount *, struct freeblks *, struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); static int jwait(struct worklist *, int); static struct inodedep *inodedep_lookup_ip(struct inode *); static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *); static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); static void handle_jwork(struct workhead *); static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, struct mkdir **); static struct jblocks *jblocks_create(void); static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); static void jblocks_free(struct jblocks *, struct mount *, int); static void jblocks_destroy(struct jblocks *); static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); /* * Exported softdep operations. */ static void softdep_disk_io_initiation(struct buf *); static void softdep_disk_write_complete(struct buf *); static void softdep_deallocate_dependencies(struct buf *); static int softdep_count_dependencies(struct buf *bp, int); /* * Global lock over all of soft updates. */ static struct mtx lk; MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF); #define ACQUIRE_GBLLOCK(lk) mtx_lock(lk) #define FREE_GBLLOCK(lk) mtx_unlock(lk) #define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED) /* * Per-filesystem soft-updates locking. 
*/ #define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock) #define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock) #define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock) #define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock) #define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \ RA_WLOCKED) #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) /* * Worklist queue management. * These routines require that the lock be held. */ #ifndef /* NOT */ DEBUG #define WORKLIST_INSERT(head, item) do { \ (item)->wk_state |= ONWORKLIST; \ LIST_INSERT_HEAD(head, item, wk_list); \ } while (0) #define WORKLIST_REMOVE(item) do { \ (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE #else /* DEBUG */ static void worklist_insert(struct workhead *, struct worklist *, int); static void worklist_remove(struct worklist *, int); #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) #define WORKLIST_REMOVE(item) worklist_remove(item, 1) #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) static void worklist_insert(head, item, locked) struct workhead *head; struct worklist *item; int locked; { if (locked) LOCK_OWNED(VFSTOUFS(item->wk_mp)); if (item->wk_state & ONWORKLIST) panic("worklist_insert: %p %s(0x%X) already on list", item, TYPENAME(item->wk_type), item->wk_state); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void worklist_remove(item, locked) struct worklist *item; int locked; { if (locked) LOCK_OWNED(VFSTOUFS(item->wk_mp)); if ((item->wk_state & ONWORKLIST) == 0) panic("worklist_remove: %p %s(0x%X) not on list", item, TYPENAME(item->wk_type), item->wk_state); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } #endif /* DEBUG */ /* * Merge two jsegdeps, keeping only the oldest one, since newer references * cannot be discarded until after the older ones. */ static inline struct jsegdep * jsegdep_merge(struct jsegdep *one, struct jsegdep *two) { struct jsegdep *swp; if (two == NULL) return (one); if (one->jd_seg->js_seq > two->jd_seg->js_seq) { swp = one; one = two; two = swp; } WORKLIST_REMOVE(&two->jd_list); free_jsegdep(two); return (one); } /* * If two freedeps are compatible, free one to reduce list size. */ static inline struct freedep * freedep_merge(struct freedep *one, struct freedep *two) { if (two == NULL) return (one); if (one->fd_freework == two->fd_freework) { WORKLIST_REMOVE(&two->fd_list); free_freedep(two); } return (one); } /* * Move journal work from one list to another. Duplicate freedeps and * jsegdeps are coalesced to keep the lists as small as possible.
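*/

/*
 * Editorial note: a minimal sketch of the merge rule above, with
 * hypothetical types. Journal segments retire strictly in sequence
 * order, so when two dependencies reference segments, only the one
 * with the older (smaller) sequence number constrains anything; the
 * newer reference can be dropped.
 */
struct seqref {
	unsigned long sr_seq;	/* sequence number of referenced segment */
};

static struct seqref *
seqref_merge(struct seqref *one, struct seqref *two,
    void (*discard)(struct seqref *))
{
	struct seqref *keep, *drop;

	if (two == NULL)
		return (one);
	keep = (one->sr_seq <= two->sr_seq) ? one : two;
	drop = (keep == one) ? two : one;
	discard(drop);	/* the newer reference is redundant */
	return (keep);
}

/*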
*/ static void jwork_move(dst, src) struct workhead *dst; struct workhead *src; { struct freedep *freedep; struct jsegdep *jsegdep; struct worklist *wkn; struct worklist *wk; KASSERT(dst != src, ("jwork_move: dst == src")); freedep = NULL; jsegdep = NULL; LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { if (wk->wk_type == D_JSEGDEP) jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); else if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } while ((wk = LIST_FIRST(src)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(dst, wk); if (wk->wk_type == D_JSEGDEP) { jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); continue; } if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } } static void jwork_insert(dst, jsegdep) struct workhead *dst; struct jsegdep *jsegdep; { struct jsegdep *jsegdepn; struct worklist *wk; LIST_FOREACH(wk, dst, wk_list) if (wk->wk_type == D_JSEGDEP) break; if (wk == NULL) { WORKLIST_INSERT(dst, &jsegdep->jd_list); return; } jsegdepn = WK_JSEGDEP(wk); if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { WORKLIST_REMOVE(wk); free_jsegdep(jsegdepn); WORKLIST_INSERT(dst, &jsegdep->jd_list); } else free_jsegdep(jsegdep); } /* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); static void workitem_alloc(struct worklist *, int, struct mount *); static void workitem_reassign(struct worklist *, int); #define WORKITEM_FREE(item, type) \ workitem_free((struct worklist *)(item), (type)) #define WORKITEM_REASSIGN(item, type) \ workitem_reassign((struct worklist *)(item), (type)) static void workitem_free(item, type) struct worklist *item; int type; { struct ufsmount *ump; #ifdef DEBUG if (item->wk_state & ONWORKLIST) panic("workitem_free: %s(0x%X) still on list", TYPENAME(item->wk_type), item->wk_state); if (item->wk_type != type && type != D_NEWBLK) panic("workitem_free: type mismatch %s != %s", TYPENAME(item->wk_type), TYPENAME(type)); #endif if (item->wk_state & IOWAITING) wakeup(item); ump = VFSTOUFS(item->wk_mp); LOCK_OWNED(ump); KASSERT(ump->softdep_deps > 0, ("workitem_free: %s: softdep_deps going negative", ump->um_fs->fs_fsmnt)); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); KASSERT(dep_current[item->wk_type] > 0, ("workitem_free: %s: dep_current[%s] going negative", ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); KASSERT(ump->softdep_curdeps[item->wk_type] > 0, ("workitem_free: %s: softdep_curdeps[%s] going negative", ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); atomic_subtract_long(&dep_current[item->wk_type], 1); ump->softdep_curdeps[item->wk_type] -= 1; free(item, DtoM(type)); } static void workitem_alloc(item, type, mp) struct worklist *item; int type; struct mount *mp; { struct ufsmount *ump; item->wk_type = type; item->wk_mp = mp; item->wk_state = 0; ump = VFSTOUFS(mp); ACQUIRE_GBLLOCK(&lk); dep_current[type]++; if (dep_current[type] > dep_highuse[type]) dep_highuse[type] = dep_current[type]; dep_total[type]++; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump); ump->softdep_curdeps[type] += 1; ump->softdep_deps++; ump->softdep_accdeps++; FREE_LOCK(ump); } static void workitem_reassign(item, newtype) struct worklist *item; int newtype; { struct ufsmount *ump; ump = VFSTOUFS(item->wk_mp); LOCK_OWNED(ump); KASSERT(ump->softdep_curdeps[item->wk_type] > 0, ("workitem_reassign: %s: softdep_curdeps[%s] going negative", VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); ump->softdep_curdeps[item->wk_type] -= 1; 
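/*
 * Hand the per-type accounting from the old type to the new one: the
 * per-mount softdep_curdeps[] counters move under the per-filesystem
 * lock held on entry, while the global dep_current[]/dep_highuse[]
 * counters below are adjusted under the global lock.
 */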
ump->softdep_curdeps[newtype] += 1; KASSERT(dep_current[item->wk_type] > 0, ("workitem_reassign: %s: dep_current[%s] going negative", VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); ACQUIRE_GBLLOCK(&lk); dep_current[newtype]++; dep_current[item->wk_type]--; if (dep_current[newtype] > dep_highuse[newtype]) dep_highuse[newtype] = dep_current[newtype]; dep_total[newtype]++; FREE_GBLLOCK(&lk); item->wk_type = newtype; } /* * Workitem queue management */ static int max_softdeps; /* maximum number of structs before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ static int proc_waiting; /* tracks whether we have a timeout posted */ static int *stat_countp; /* statistic to count in proc_waiting timeout */ static struct callout softdep_callout; static int req_clear_inodedeps; /* syncer process flush some inodedeps */ static int req_clear_remove; /* syncer process flush some freeblks */ static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */ /* * runtime statistics */ static int stat_flush_threads; /* number of softdep flushing threads */ static int stat_worklist_push; /* number of worklist cleanups */ static int stat_blk_limit_push; /* number of times block limit neared */ static int stat_ino_limit_push; /* number of times inode limit neared */ static int stat_blk_limit_hit; /* number of times block slowdown imposed */ static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ static int stat_journal_min; /* Times hit journal min threshold */ static int stat_journal_low; /* Times hit journal low threshold */ static int stat_journal_wait; /* Times blocked in jwait(). */ static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. 
*/ static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ static int stat_cleanup_failures; /* Number of cleanup requests that failed */ static int stat_emptyjblocks; /* Number of potentially empty journal blocks */ SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD, &stat_flush_threads, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, &stat_jaddref, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, &stat_jnewblk, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW, &stat_journal_low, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, &stat_journal_min, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, &stat_journal_wait, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, &stat_jwait_filepage, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, &stat_jwait_freeblks, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, &stat_jwait_inode, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, &stat_jwait_newblk, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW, &stat_cleanup_blkrequests, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW, &stat_cleanup_inorequests, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW, &stat_cleanup_high_delay, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW, &stat_cleanup_retries, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW, &stat_cleanup_failures, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW, &softdep_flushcache, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD, &stat_emptyjblocks, 0, ""); SYSCTL_DECL(_vfs_ffs); /* Whether to recompute the summary at mount time */ static int compute_summary_at_mount = 0; SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); static int print_threads = 0; SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW, &print_threads, 0, "Notify flusher thread 
start/stop"); /* List of all filesystems mounted with soft updates */ static TAILQ_HEAD(, mount_softdeps) softdepmounts; /* * This function cleans the worklist for a filesystem. * Each filesystem running with soft dependencies gets its own * thread to run in this function. The thread is started up in * softdep_mount and shutdown in softdep_unmount. They show up * as part of the kernel "bufdaemon" process whose process * entry is available in bufdaemonproc. */ static int searchfailed; extern struct proc *bufdaemonproc; static void softdep_flush(addr) void *addr; { struct mount *mp; struct thread *td; struct ufsmount *ump; td = curthread; td->td_pflags |= TDP_NORUNNINGBUF; mp = (struct mount *)addr; ump = VFSTOUFS(mp); atomic_add_int(&stat_flush_threads, 1); ACQUIRE_LOCK(ump); ump->softdep_flags &= ~FLUSH_STARTING; wakeup(&ump->softdep_flushtd); FREE_LOCK(ump); if (print_threads) { if (stat_flush_threads == 1) printf("Running %s at pid %d\n", bufdaemonproc->p_comm, bufdaemonproc->p_pid); printf("Start thread %s\n", td->td_name); } for (;;) { while (softdep_process_worklist(mp, 0) > 0 || (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)) kthread_suspend_check(); ACQUIRE_LOCK(ump); if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdflush", hz / 2); ump->softdep_flags &= ~FLUSH_CLEANUP; /* * Check to see if we are done and need to exit. */ if ((ump->softdep_flags & FLUSH_EXIT) == 0) { FREE_LOCK(ump); continue; } ump->softdep_flags &= ~FLUSH_EXIT; FREE_LOCK(ump); wakeup(&ump->softdep_flags); if (print_threads) printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups); atomic_subtract_int(&stat_flush_threads, 1); kthread_exit(); panic("kthread_exit failed\n"); } } static void worklist_speedup(mp) struct mount *mp; { struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) ump->softdep_flags |= FLUSH_CLEANUP; wakeup(&ump->softdep_flushtd); } static int softdep_speedup(ump) struct ufsmount *ump; { struct ufsmount *altump; struct mount_softdeps *sdp; LOCK_OWNED(ump); worklist_speedup(ump->um_mountp); bd_speedup(); /* * If we have global shortages, then we need other * filesystems to help with the cleanup. Here we wakeup a * flusher thread for a filesystem that is over its fair * share of resources. */ if (req_clear_inodedeps || req_clear_remove) { ACQUIRE_GBLLOCK(&lk); TAILQ_FOREACH(sdp, &softdepmounts, sd_next) { if ((altump = sdp->sd_ump) == ump) continue; if (((req_clear_inodedeps && altump->softdep_curdeps[D_INODEDEP] > max_softdeps / stat_flush_threads) || (req_clear_remove && altump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) / stat_flush_threads)) && TRY_ACQUIRE_LOCK(altump)) break; } if (sdp == NULL) { searchfailed++; FREE_GBLLOCK(&lk); } else { /* * Move to the end of the list so we pick a * different one on out next try. */ TAILQ_REMOVE(&softdepmounts, sdp, sd_next); TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); FREE_GBLLOCK(&lk); if ((altump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) altump->softdep_flags |= FLUSH_CLEANUP; altump->um_softdep->sd_cleanups++; wakeup(&altump->softdep_flushtd); FREE_LOCK(altump); } } return (speedup_syncer()); } /* * Add an item to the end of the work queue. * This routine requires that the lock be held. * This is the only routine that adds items to the list. 
* The following routine is the only one that removes items * and does so in order from first to last. */ #define WK_HEAD 0x0001 /* Add to HEAD. */ #define WK_NODELAY 0x0002 /* Process immediately. */ static void add_to_worklist(wk, flags) struct worklist *wk; int flags; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); if (wk->wk_state & ONWORKLIST) panic("add_to_worklist: %s(0x%X) already on list", TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST; if (ump->softdep_on_worklist == 0) { LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); ump->softdep_worklist_tail = wk; } else if (flags & WK_HEAD) { LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); } else { LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; } ump->softdep_on_worklist += 1; if (flags & WK_NODELAY) worklist_speedup(wk->wk_mp); } /* * Remove the item to be processed. If we are removing the last * item on the list, we need to recalculate the tail pointer. */ static void remove_from_worklist(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); WORKLIST_REMOVE(wk); if (ump->softdep_worklist_tail == wk) ump->softdep_worklist_tail = (struct worklist *)wk->wk_list.le_prev; ump->softdep_on_worklist -= 1; } static void wake_worklist(wk) struct worklist *wk; { if (wk->wk_state & IOWAITING) { wk->wk_state &= ~IOWAITING; wakeup(wk); } } static void wait_worklist(wk, wmesg) struct worklist *wk; char *wmesg; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); wk->wk_state |= IOWAITING; msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0); } /* * Process that runs once per second to handle items in the background queue. * * Note that items are processed in the order in which they * appear in the queue. The code below depends on this property to ensure * that blocks of a file are freed before the inode itself is freed. This * ordering ensures that no new <vfsid, inum, lbn> triples will be generated * until all the old ones have been purged from the dependency lists. */ static int softdep_process_worklist(mp, full) struct mount *mp; int full; { int cnt, matchcnt; struct ufsmount *ump; long starttime; KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); if (MOUNTEDSOFTDEP(mp) == 0) return (0); matchcnt = 0; ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); starttime = time_second; softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0); check_clear_deps(mp); while (ump->softdep_on_worklist > 0) { if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) break; else matchcnt += cnt; check_clear_deps(mp); /* * We do not generally want to stop for buffer space, but if * we are really being a buffer hog, we will stop and wait. */ if (should_yield()) { FREE_LOCK(ump); kern_yield(PRI_USER); bwillwrite(); ACQUIRE_LOCK(ump); } /* * Never allow processing to run for more than one * second. This gives the syncer thread the opportunity * to pause if appropriate. */ if (!full && starttime != time_second) break; } if (full == 0) journal_unsuspend(ump); FREE_LOCK(ump); return (matchcnt); } /* * Process all removes associated with a vnode if we are running out of * journal space. Any other process which attempts to flush these will * be unable to do so, as we have the vnodes locked. */
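/*
 * Editorial note: a minimal userland sketch (hypothetical names) of the
 * tail-tracked insertion used by add_to_worklist() above. The LIST_*
 * macros offer no O(1) append, so the queue caches its own tail pointer;
 * the real code additionally supports explicit head insertion (WK_HEAD).
 */
#include <stddef.h>
#include <sys/queue.h>

struct item {
	LIST_ENTRY(item) link;
};

struct workqueue {
	LIST_HEAD(, item) head;
	struct item *tail;	/* last item; meaningless when count == 0 */
	int count;
};

static void
workqueue_append(struct workqueue *q, struct item *it)
{
	if (q->count == 0)
		LIST_INSERT_HEAD(&q->head, it, link);
	else
		LIST_INSERT_AFTER(q->tail, it, link);
	q->tail = it;
	q->count++;
}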
static void process_removes(vp) struct vnode *vp; { struct inodedep *inodedep; struct dirrem *dirrem; struct ufsmount *ump; struct mount *mp; ino_t inum; mp = vp->v_mount; ump = VFSTOUFS(mp); LOCK_OWNED(ump); inum = VTOI(vp)->i_number; for (;;) { top: if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) return; LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) { /* * If another thread is trying to lock this vnode * it will fail but we must wait for it to do so * before we can proceed. */ if (dirrem->dm_state & INPROGRESS) { wait_worklist(&dirrem->dm_list, "pwrwait"); goto top; } if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == (COMPLETE | ONWORKLIST)) break; } if (dirrem == NULL) return; remove_from_worklist(&dirrem->dm_list); FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_removes: suspended filesystem"); handle_workitem_remove(dirrem, 0); vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); } } /* * Process all truncations associated with a vnode if we are running out * of journal space. This is called when the vnode lock is already held * and no other process can clear the truncation. */ static void process_truncates(vp) struct vnode *vp; { struct inodedep *inodedep; struct freeblks *freeblks; struct ufsmount *ump; struct mount *mp; ino_t inum; int cgwait; mp = vp->v_mount; ump = VFSTOUFS(mp); LOCK_OWNED(ump); inum = VTOI(vp)->i_number; for (;;) { if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) return; cgwait = 0; TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) { /* Journal entries not yet written. */ if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) { jwait(&LIST_FIRST( &freeblks->fb_jblkdephd)->jb_list, MNT_WAIT); break; } /* Another thread is executing this item. */ if (freeblks->fb_state & INPROGRESS) { wait_worklist(&freeblks->fb_list, "ptrwait"); break; } /* Freeblks is waiting on an inode write. */ if ((freeblks->fb_state & COMPLETE) == 0) { FREE_LOCK(ump); ffs_update(vp, 1); ACQUIRE_LOCK(ump); break; } if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) == (ALLCOMPLETE | ONWORKLIST)) { remove_from_worklist(&freeblks->fb_list); freeblks->fb_state |= INPROGRESS; FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_truncates: " "suspended filesystem"); handle_workitem_freeblocks(freeblks, 0); vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); break; } if (freeblks->fb_cgwait) cgwait++; } if (cgwait) { FREE_LOCK(ump); sync_cgs(mp, MNT_WAIT); ffs_sync_snap(mp, MNT_WAIT); ACQUIRE_LOCK(ump); continue; } if (freeblks == NULL) break; } return; } /* * Process one item on the worklist. */ static int process_worklist_item(mp, target, flags) struct mount *mp; int target; int flags; { struct worklist sentinel; struct worklist *wk; struct ufsmount *ump; int matchcnt; int error; KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to write as we may * recurse into the copy-on-write routine. */ if (curthread->td_pflags & TDP_COWINPROGRESS) return (-1); PHOLD(curproc); /* Don't let the stack go away. */
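/*
 * The loop below links a sentinel workitem into the queue so that the
 * softdep lock can be dropped while each item is handled; the scan then
 * resumes from the sentinel's current position instead of from a
 * possibly stale list pointer.
 */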
ump = VFSTOUFS(mp); LOCK_OWNED(ump); matchcnt = 0; sentinel.wk_mp = NULL; sentinel.wk_type = D_SENTINEL; LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list); for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL; wk = LIST_NEXT(&sentinel, wk_list)) { if (wk->wk_type == D_SENTINEL) { LIST_REMOVE(&sentinel, wk_list); LIST_INSERT_AFTER(wk, &sentinel, wk_list); continue; } if (wk->wk_state & INPROGRESS) panic("process_worklist_item: %p already in progress.", wk); wk->wk_state |= INPROGRESS; remove_from_worklist(wk); FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); switch (wk->wk_type) { case D_DIRREM: /* removal of a directory entry */ error = handle_workitem_remove(WK_DIRREM(wk), flags); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ error = handle_workitem_freeblocks(WK_FREEBLKS(wk), flags); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ handle_workitem_freefrag(WK_FREEFRAG(wk)); error = 0; break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ handle_workitem_freefile(WK_FREEFILE(wk)); error = 0; break; default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); /* NOTREACHED */ } vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); if (error == 0) { if (++matchcnt == target) break; continue; } /* * We have to retry the worklist item later. Wake up any * waiters who may be able to complete it immediately and * add the item back to the head so we don't try to execute * it again. */ wk->wk_state &= ~INPROGRESS; wake_worklist(wk); add_to_worklist(wk, WK_HEAD); } LIST_REMOVE(&sentinel, wk_list); /* Sentinel could've become the tail from remove_from_worklist. */ if (ump->softdep_worklist_tail == &sentinel) ump->softdep_worklist_tail = (struct worklist *)sentinel.wk_list.le_prev; PRELE(curproc); return (matchcnt); } /* * Move dependencies from one buffer to another. */ int softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; struct ufsmount *ump; int dirty; if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL) return (0); KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_move_dependencies called on non-softdep filesystem")); dirty = 0; wktail = NULL; ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); if (wk->wk_type == D_BMSAFEMAP && bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp)) dirty = 1; if (wktail == NULL) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else LIST_INSERT_AFTER(wktail, wk, wk_list); wktail = wk; } FREE_LOCK(ump); return (dirty); } /* * Purge the work list of all items associated with a particular mount point. */ int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { struct vnode *devvp; struct ufsmount *ump; int count, error; /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing * creates. We continue until no more worklist dependencies * are found.
*/ *countp = 0; error = 0; ump = VFSTOUFS(oldmnt); devvp = ump->um_devvp; while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { *countp += count; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp, 0); if (error != 0) break; } return (error); } #define SU_WAITIDLE_RETRIES 20 static int softdep_waitidle(struct mount *mp, int flags __unused) { struct ufsmount *ump; struct vnode *devvp; struct thread *td; int error, i; ump = VFSTOUFS(mp); devvp = ump->um_devvp; td = curthread; error = 0; ACQUIRE_LOCK(ump); for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) { ump->softdep_req = 1; KASSERT((flags & FORCECLOSE) == 0 || ump->softdep_on_worklist == 0, ("softdep_waitidle: work added after flush")); msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP, "softdeps", 10 * hz); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp, 0); ACQUIRE_LOCK(ump); if (error != 0) break; } ump->softdep_req = 0; if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) { error = EBUSY; printf("softdep_waitidle: Failed to flush worklist for %p\n", mp); } FREE_LOCK(ump); return (error); } /* * Flush all vnodes and worklist items associated with a specified mount point. */ int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { #ifdef QUOTA struct ufsmount *ump; int i; #endif int error, early, depcount, loopcnt, retry_flush_count, retry; int morework; KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0, ("softdep_flushfiles called on non-softdep filesystem")); loopcnt = 10; retry_flush_count = 3; retry_flush: error = 0; /* * Alternately flush the vnodes associated with the mount * point and process any dependencies that the flushing * creates. In theory, this loop can happen at most twice, * but we give it a few extra just to be sure. */ for (; loopcnt > 0; loopcnt--) { /* * Do another flush in case any vnodes were brought in * as part of the cleanup operations. */ early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH; if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0) break; if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || depcount == 0) break; } /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem * activity can keep us busy forever, so we just fail with EBUSY. */ if (loopcnt == 0) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) panic("softdep_flushfiles: looping"); error = EBUSY; } if (!error) error = softdep_waitidle(oldmnt, flags); if (!error) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { retry = 0; MNT_ILOCK(oldmnt); KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, ("softdep_flushfiles: !MNTK_NOINSMNTQ")); morework = oldmnt->mnt_nvnodelistsize > 0; #ifdef QUOTA ump = VFSTOUFS(oldmnt); UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] != NULLVP) morework = 1; } UFS_UNLOCK(ump); #endif if (morework) { if (--retry_flush_count > 0) { retry = 1; loopcnt = 3; } else error = EBUSY; } MNT_IUNLOCK(oldmnt); if (retry) goto retry_flush; } } return (error); } /* * Structure hashing. * * There are four types of structures that can be looked up: * 1) pagedep structures identified by mount point, inode number, * and logical block. * 2) inodedep structures identified by mount point and inode number. * 3) newblk structures identified by mount point and * physical block number. 
* 4) bmsafemap structures identified by mount point and * cylinder group number. * * The "pagedep" and "inodedep" dependency structures are hashed * separately from the file blocks and inodes to which they correspond. * This separation helps when the in-memory copy of an inode or * file block must be replaced. It also obviates the need to access * an inode or file page when simply updating (or de-allocating) * dependency structures. Lookup of newblk structures is needed to * find newly allocated blocks when trying to associate them with * their allocdirect or allocindir structure. * * The lookup routines optionally create and hash a new instance when * an existing entry is not found. The bmsafemap lookup routine always * allocates a new structure if an existing one is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ /* * Structures and routines associated with pagedep caching. */ #define PAGEDEP_HASH(ump, inum, lbn) \ (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size]) static int pagedep_find(pagedephd, ino, lbn, pagedeppp) struct pagedep_hashhead *pagedephd; ino_t ino; ufs_lbn_t lbn; struct pagedep **pagedeppp; { struct pagedep *pagedep; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) { *pagedeppp = pagedep; return (1); } } *pagedeppp = NULL; return (0); } /* * Look up a pagedep. Return 1 if found, 0 otherwise. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. * This routine must be called with splbio interrupts blocked. */ static int pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) struct mount *mp; struct buf *bp; ino_t ino; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; struct worklist *wk; struct ufsmount *ump; int ret; int i; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if (bp) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type == D_PAGEDEP) { *pagedeppp = WK_PAGEDEP(wk); return (1); } } } pagedephd = PAGEDEP_HASH(ump, ino, lbn); ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); if (ret) { if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); return (1); } if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(ump); pagedep = malloc(sizeof(struct pagedep), M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(ump); ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); if (*pagedeppp) { /* * This should never happen since we only create pagedeps * with the vnode lock held. Could be an assert. */ WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } pagedep->pd_ino = ino; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); for (i = 0; i < DAHASHSZ; i++) LIST_INIT(&pagedep->pd_diraddhd[i]); LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); *pagedeppp = pagedep; return (0); } /* * Structures and routines associated with inodedep caching. */ #define INODEDEP_HASH(ump, inum) \ (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size]) static int inodedep_find(inodedephd, inum, inodedeppp) struct inodedep_hashhead *inodedephd; ino_t inum; struct inodedep **inodedeppp; { struct inodedep *inodedep; LIST_FOREACH(inodedep, inodedephd, id_hash) if (inum == inodedep->id_ino) break; if (inodedep) { *inodedeppp = inodedep; return (1); } *inodedeppp = NULL; return (0); } /* * Look up an inodedep. 
Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in inodedeppp. * This routine must be called with splbio interrupts blocked. */ static int inodedep_lookup(mp, inum, flags, inodedeppp) struct mount *mp; ino_t inum; int flags; struct inodedep **inodedeppp; { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; inodedephd = INODEDEP_HASH(ump, inum); if (inodedep_find(inodedephd, inum, inodedeppp)) return (1); if ((flags & DEPALLOC) == 0) return (0); /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and * we are not in a rush, request some inodedep cleanup. */ if (softdep_excess_items(ump, D_INODEDEP)) schedule_cleanup(mp); else FREE_LOCK(ump); inodedep = malloc(sizeof(struct inodedep), M_INODEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); ACQUIRE_LOCK(ump); if (inodedep_find(inodedephd, inum, inodedeppp)) { WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } inodedep->id_fs = fs; inodedep->id_ino = inum; inodedep->id_state = ALLCOMPLETE; inodedep->id_nlinkdelta = 0; inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_savednlink = -1; inodedep->id_bmsafemap = NULL; inodedep->id_mkdiradd = NULL; LIST_INIT(&inodedep->id_dirremhd); LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoreflst); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); TAILQ_INIT(&inodedep->id_newextupdt); TAILQ_INIT(&inodedep->id_freeblklst); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); *inodedeppp = inodedep; return (0); } /* * Structures and routines associated with newblk caching. */ #define NEWBLK_HASH(ump, inum) \ (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size]) static int newblk_find(newblkhd, newblkno, flags, newblkpp) struct newblk_hashhead *newblkhd; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; LIST_FOREACH(newblk, newblkhd, nb_hash) { if (newblkno != newblk->nb_newblkno) continue; /* * If we're creating a new dependency don't match those that * have already been converted to allocdirects. This is for * a frag extend. */ if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) continue; break; } if (newblk) { *newblkpp = newblk; return (1); } *newblkpp = NULL; return (0); } /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. 
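 *
 * Minimal usage sketch (editor's illustration, not part of the source;
 * note the routine may drop and reacquire the per-filesystem lock when
 * it has to allocate):
 *
 *	ACQUIRE_LOCK(ump);
 *	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) == 0)
 *		(newblk was freshly allocated: zeroed, state ATTACHED)
 *	FREE_LOCK(ump);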
*/ static int newblk_lookup(mp, newblkno, flags, newblkpp) struct mount *mp; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; struct newblk_hashhead *newblkhd; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); newblkhd = NEWBLK_HASH(ump, newblkno); if (newblk_find(newblkhd, newblkno, flags, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); if (softdep_excess_items(ump, D_NEWBLK) || softdep_excess_items(ump, D_ALLOCDIRECT) || softdep_excess_items(ump, D_ALLOCINDIR)) schedule_cleanup(mp); else FREE_LOCK(ump); newblk = malloc(sizeof(union allblk), M_NEWBLK, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); ACQUIRE_LOCK(ump); if (newblk_find(newblkhd, newblkno, flags, newblkpp)) { WORKITEM_FREE(newblk, D_NEWBLK); return (1); } newblk->nb_freefrag = NULL; LIST_INIT(&newblk->nb_indirdeps); LIST_INIT(&newblk->nb_newdirblk); LIST_INIT(&newblk->nb_jwork); newblk->nb_state = ATTACHED; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; return (0); } /* * Structures and routines associated with freed indirect block caching. */ #define INDIR_HASH(ump, blkno) \ (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size]) /* * Lookup an indirect block in the indir hash table. The freework is * removed and potentially freed. The caller must do a blocking journal * write before writing to the blkno. */ static int indirblk_lookup(mp, blkno) struct mount *mp; ufs2_daddr_t blkno; { struct freework *freework; struct indir_hashhead *wkhd; struct ufsmount *ump; ump = VFSTOUFS(mp); wkhd = INDIR_HASH(ump, blkno); TAILQ_FOREACH(freework, wkhd, fw_next) { if (freework->fw_blkno != blkno) continue; indirblk_remove(freework); return (1); } return (0); } /* * Insert an indirect block represented by freework into the indirblk * hash table so that it may prevent the block from being re-used prior * to the journal being written. */ static void indirblk_insert(freework) struct freework *freework; { struct jblocks *jblocks; struct jseg *jseg; struct ufsmount *ump; ump = VFSTOUFS(freework->fw_list.wk_mp); jblocks = ump->softdep_jblocks; jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst); if (jseg == NULL) return; LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs); TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); freework->fw_state &= ~DEPCOMPLETE; } static void indirblk_remove(freework) struct freework *freework; { struct ufsmount *ump; ump = VFSTOUFS(freework->fw_list.wk_mp); LIST_REMOVE(freework, fw_segs); TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); freework->fw_state |= DEPCOMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); } /* * Executed during filesystem system initialization before * mounting any filesystems. */ void softdep_initialize() { TAILQ_INIT(&softdepmounts); #ifdef __LP64__ max_softdeps = desiredvnodes * 4; #else max_softdeps = desiredvnodes * 2; #endif /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; bioops.io_complete = softdep_disk_write_complete; bioops.io_deallocate = softdep_deallocate_dependencies; bioops.io_countdeps = softdep_count_dependencies; softdep_ast_cleanup = softdep_ast_cleanup_proc; /* Initialize the callout with an mtx. */ callout_init_mtx(&softdep_callout, &lk, 0); } /* * Executed after all filesystems have been unmounted during * filesystem module unload. 
*/ void softdep_uninitialize() { /* clear bioops hack */ bioops.io_start = NULL; bioops.io_complete = NULL; bioops.io_deallocate = NULL; bioops.io_countdeps = NULL; softdep_ast_cleanup = NULL; callout_drain(&softdep_callout); } /* * Called at mount time to notify the dependency code that a * filesystem wishes to use it. */ int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { struct csum_total cstotal; struct mount_softdeps *sdp; struct ufsmount *ump; struct cg *cgp; struct buf *bp; int i, error, cyl; sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA, M_WAITOK | M_ZERO); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | MNTK_SOFTDEP | MNTK_NOASYNC; } ump = VFSTOUFS(mp); ump->um_softdep = sdp; MNT_IUNLOCK(mp); rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock"); sdp->sd_ump = ump; LIST_INIT(&ump->softdep_workitem_pending); LIST_INIT(&ump->softdep_journal_pending); TAILQ_INIT(&ump->softdep_unlinked); LIST_INIT(&ump->softdep_dirtycg); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; LIST_INIT(&ump->softdep_mkdirlisthd); ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &ump->pagedep_hash_size); ump->pagedep_nextclean = 0; ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &ump->inodedep_hash_size); ump->inodedep_nextclean = 0; ump->newblk_hashtbl = hashinit(max_softdeps / 2, M_NEWBLK, &ump->newblk_hash_size); ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &ump->bmsafemap_hash_size); i = 1 << (ffs(desiredvnodes / 10) - 1); ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead), M_FREEWORK, M_WAITOK); ump->indir_hash_size = i - 1; for (i = 0; i <= ump->indir_hash_size; i++) TAILQ_INIT(&ump->indir_hashtbl[i]); ACQUIRE_GBLLOCK(&lk); TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); FREE_GBLLOCK(&lk); if ((fs->fs_flags & FS_SUJ) && (error = journal_mount(mp, fs, cred)) != 0) { printf("Failed to start journal: %d\n", error); softdep_unmount(mp); return (error); } /* * Start our flushing thread in the bufdaemon process. */ ACQUIRE_LOCK(ump); ump->softdep_flags |= FLUSH_STARTING; FREE_LOCK(ump); kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc, &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker", mp->mnt_stat.f_mntonname); ACQUIRE_LOCK(ump); while ((ump->softdep_flags & FLUSH_STARTING) != 0) { msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart", hz / 2); } FREE_LOCK(ump); /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation * can take a long time and can be deferred for background * fsck. However, the old behavior of scanning the cylinder * groups and recalculating them at mount time is available * by setting vfs.ffs.compute_summary_at_mount to one. 
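 *
 * For example (editor's note), an administrator wanting the old
 * behavior back would set
 *
 *	sysctl vfs.ffs.compute_summary_at_mount=1
 *
 * before mounting, making the loop below re-read every cylinder group
 * and rebuild fs_cstotal from the per-cg cg_cs counters.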
*/ if (compute_summary_at_mount == 0 || fs->fs_clean != 0) return (0); bzero(&cstotal, sizeof cstotal); for (cyl = 0; cyl < fs->fs_ncg; cyl++) { if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), fs->fs_cgsize, cred, &bp)) != 0) { brelse(bp); softdep_unmount(mp); return (error); } cgp = (struct cg *)bp->b_data; cstotal.cs_nffree += cgp->cg_cs.cs_nffree; cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; cstotal.cs_nifree += cgp->cg_cs.cs_nifree; cstotal.cs_ndir += cgp->cg_cs.cs_ndir; fs->fs_cs(fs, cyl) = cgp->cg_cs; brelse(bp); } #ifdef DEBUG if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); #endif bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); return (0); } void softdep_unmount(mp) struct mount *mp; { struct ufsmount *ump; #ifdef INVARIANTS int i; #endif KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_unmount called on non-softdep filesystem")); ump = VFSTOUFS(mp); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_SOFTDEP; if (MOUNTEDSUJ(mp) == 0) { MNT_IUNLOCK(mp); } else { mp->mnt_flag &= ~MNT_SUJ; MNT_IUNLOCK(mp); journal_unmount(ump); } /* * Shut down our flushing thread. The check for NULL handles the case * where softdep_mount errors out before the thread has been created. */ if (ump->softdep_flushtd != NULL) { ACQUIRE_LOCK(ump); ump->softdep_flags |= FLUSH_EXIT; wakeup(&ump->softdep_flushtd); msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP, "sdwait", 0); KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0, ("Thread shutdown failed")); } /* * Free up our resources. */ ACQUIRE_GBLLOCK(&lk); TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next); FREE_GBLLOCK(&lk); rw_destroy(LOCK_PTR(ump)); hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size); hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size); hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size); hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP, ump->bmsafemap_hash_size); free(ump->indir_hashtbl, M_FREEWORK); #ifdef INVARIANTS for (i = 0; i <= D_LAST; i++) KASSERT(ump->softdep_curdeps[i] == 0, ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt, TYPENAME(i), ump->softdep_curdeps[i])); #endif free(ump->um_softdep, M_MOUNTDATA); } static struct jblocks * jblocks_create(void) { struct jblocks *jblocks; jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); TAILQ_INIT(&jblocks->jb_segs); jblocks->jb_avail = 10; jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, M_JBLOCKS, M_WAITOK | M_ZERO); return (jblocks); } static ufs2_daddr_t jblocks_alloc(jblocks, bytes, actual) struct jblocks *jblocks; int bytes; int *actual; { ufs2_daddr_t daddr; struct jextent *jext; int freecnt; int blocks; blocks = bytes / DEV_BSIZE; jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks - jblocks->jb_off; if (freecnt == 0) { jblocks->jb_off = 0; if (++jblocks->jb_head > jblocks->jb_used) jblocks->jb_head = 0; jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks; } if (freecnt > blocks) freecnt = blocks; *actual = freecnt * DEV_BSIZE; daddr = jext->je_daddr + jblocks->jb_off; jblocks->jb_off += freecnt; jblocks->jb_free -= freecnt; return (daddr); } static void jblocks_free(jblocks, mp, bytes) struct jblocks *jblocks; struct mount *mp; int bytes; { LOCK_OWNED(VFSTOUFS(mp)); jblocks->jb_free += bytes / DEV_BSIZE; if (jblocks->jb_suspended) worklist_speedup(mp); wakeup(jblocks); } static void jblocks_destroy(jblocks) struct jblocks *jblocks; { if (jblocks->jb_extent) free(jblocks->jb_extent, M_JBLOCKS);
free(jblocks, M_JBLOCKS); } static void jblocks_add(jblocks, daddr, blocks) struct jblocks *jblocks; ufs2_daddr_t daddr; int blocks; { struct jextent *jext; jblocks->jb_blocks += blocks; jblocks->jb_free += blocks; jext = &jblocks->jb_extent[jblocks->jb_used]; /* Adding the first block. */ if (jext->je_daddr == 0) { jext->je_daddr = daddr; jext->je_blocks = blocks; return; } /* Extending the last extent. */ if (jext->je_daddr + jext->je_blocks == daddr) { jext->je_blocks += blocks; return; } /* Adding a new extent. */ if (++jblocks->jb_used == jblocks->jb_avail) { jblocks->jb_avail *= 2; jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, M_JBLOCKS, M_WAITOK | M_ZERO); memcpy(jext, jblocks->jb_extent, sizeof(struct jextent) * jblocks->jb_used); free(jblocks->jb_extent, M_JBLOCKS); jblocks->jb_extent = jext; } jext = &jblocks->jb_extent[jblocks->jb_used]; jext->je_daddr = daddr; jext->je_blocks = blocks; return; } int softdep_journal_lookup(mp, vpp) struct mount *mp; struct vnode **vpp; { struct componentname cnp; struct vnode *dvp; ino_t sujournal; int error; - error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); + error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp); if (error) return (error); bzero(&cnp, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN; cnp.cn_thread = curthread; cnp.cn_cred = curthread->td_ucred; cnp.cn_pnbuf = SUJ_FILE; cnp.cn_nameptr = SUJ_FILE; cnp.cn_namelen = strlen(SUJ_FILE); error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); vput(dvp); if (error != 0) return (error); error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); return (error); } /* * Open and verify the journal file. */ static int journal_mount(mp, fs, cred) struct mount *mp; struct fs *fs; struct ucred *cred; { struct jblocks *jblocks; struct ufsmount *ump; struct vnode *vp; struct inode *ip; ufs2_daddr_t blkno; int bcount; int error; int i; ump = VFSTOUFS(mp); ump->softdep_journal_tail = NULL; ump->softdep_on_journal = 0; ump->softdep_accdeps = 0; ump->softdep_req = 0; ump->softdep_jblocks = NULL; error = softdep_journal_lookup(mp, &vp); if (error != 0) { printf("Failed to find journal. Use tunefs to create one\n"); return (error); } ip = VTOI(vp); if (ip->i_size < SUJ_MIN) { error = ENOSPC; goto out; } bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ jblocks = jblocks_create(); for (i = 0; i < bcount; i++) { error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); if (error) break; jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); } if (error) { jblocks_destroy(jblocks); goto out; } jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ ump->softdep_jblocks = jblocks; out: if (error == 0) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_SUJ; mp->mnt_flag &= ~MNT_SOFTDEP; MNT_IUNLOCK(mp); /* * Only validate the journal contents if the * filesystem is clean, otherwise we write the logs * but they'll never be used. If the filesystem was * still dirty when we mounted it the journal is * invalid and a new journal can only be valid if it * starts from a clean mount. */ if (fs->fs_clean) { DIP_SET(ip, i_modrev, fs->fs_mtime); ip->i_flags |= IN_MODIFIED; ffs_update(vp, 1); } } vput(vp); return (error); } static void journal_unmount(ump) struct ufsmount *ump; { if (ump->softdep_jblocks) jblocks_destroy(ump->softdep_jblocks); ump->softdep_jblocks = NULL; } /* * Called when a journal record is ready to be written. Space is allocated * and the journal entry is created when the journal is flushed to stable * store. 
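 *
 * Editor's note: records are kept in arrival (FIFO) order by linking
 * each new item after softdep_journal_tail, e.g. after
 *
 *	add_to_journal(a); add_to_journal(b); add_to_journal(c);
 *
 * the pending list is a->b->c with the tail at c, which is the order
 * softdep_process_journal() writes them in.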
*/ static void add_to_journal(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); if (wk->wk_state & ONWORKLIST) panic("add_to_journal: %s(0x%X) already on list", TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST | DEPCOMPLETE; if (LIST_EMPTY(&ump->softdep_journal_pending)) { ump->softdep_jblocks->jb_age = ticks; LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); } else LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); ump->softdep_journal_tail = wk; ump->softdep_on_journal += 1; } /* * Remove an arbitrary item from the journal worklist while maintaining * the tail pointer. This happens when a new operation obviates the need to * journal an old operation. */ static void remove_from_journal(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); #ifdef SUJ_DEBUG { struct worklist *wkn; LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) if (wkn == wk) break; if (wkn == NULL) panic("remove_from_journal: %p is not in journal", wk); } #endif /* * We emulate a TAILQ to save space in most structures which do not * require TAILQ semantics. Here we must update the tail position * when removing the tail which is not the final entry. This works * only if the worklist linkage is at the beginning of the structure. */ if (ump->softdep_journal_tail == wk) ump->softdep_journal_tail = (struct worklist *)wk->wk_list.le_prev; WORKLIST_REMOVE(wk); ump->softdep_on_journal -= 1; } /* * Check for journal space as well as dependency limits so the prelink * code can throttle both journaled and non-journaled filesystems. * Threshold is 0 for low and 1 for min. */ static int journal_space(ump, thresh) struct ufsmount *ump; int thresh; { struct jblocks *jblocks; int limit, avail; jblocks = ump->softdep_jblocks; if (jblocks == NULL) return (1); /* * We use a tighter restriction here to prevent request_cleanup() * running in threads from running into locks we currently hold. * We have to be over the limit and our filesystem has to be * responsible for more than our share of that usage. */ limit = (max_softdeps / 10) * 9; if (dep_current[D_INODEDEP] > limit && ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads) return (0); if (thresh) thresh = jblocks->jb_min; else thresh = jblocks->jb_low; avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; avail = jblocks->jb_free - avail; return (avail > thresh); } static void journal_suspend(ump) struct ufsmount *ump; { struct jblocks *jblocks; struct mount *mp; mp = UFSTOVFS(ump); jblocks = ump->softdep_jblocks; MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { stat_journal_min++; mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = ump->softdep_flushtd; } jblocks->jb_suspended = 1; MNT_IUNLOCK(mp); } static int journal_unsuspend(struct ufsmount *ump) { struct jblocks *jblocks; struct mount *mp; mp = UFSTOVFS(ump); jblocks = ump->softdep_jblocks; if (jblocks != NULL && jblocks->jb_suspended && journal_space(ump, jblocks->jb_min)) { jblocks->jb_suspended = 0; FREE_LOCK(ump); mp->mnt_susp_owner = curthread; vfs_write_resume(mp, 0); ACQUIRE_LOCK(ump); return (1); } return (0); } /* * Called before any allocation function to be certain that there is * sufficient space in the journal prior to creating any new records. * Since in the case of block allocation we may have multiple locked * buffers at the time of the actual allocation we can not block * when the journal records are created.
Doing so would create a deadlock * if any of these buffers needed to be flushed to reclaim space. Instead * we require a sufficiently large amount of available space such that * each thread in the system could have passed this allocation check and * still have sufficient free space. With 20% of a minimum journal size * of 1MB we have 6553 records available. */ int softdep_prealloc(vp, waitok) struct vnode *vp; int waitok; { struct ufsmount *ump; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_prealloc called on non-softdep filesystem")); /* * Nothing to do if we are not running journaled soft updates. * If we currently hold the snapshot lock, we must avoid * handling other resources that could cause deadlock. Do not * touch quotas vnode since it is typically recursed with * other vnode locks held. */ if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) || (vp->v_vflag & VV_SYSTEM) != 0) return (0); ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); if (journal_space(ump, 0)) { FREE_LOCK(ump); return (0); } stat_journal_low++; FREE_LOCK(ump); if (waitok == MNT_NOWAIT) return (ENOSPC); /* * Attempt to sync this vnode once to flush any journal * work attached to it. */ if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) ffs_syncvnode(vp, waitok, 0); ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } FREE_LOCK(ump); return (0); } /* * Before adjusting a link count on a vnode verify that we have sufficient * journal space. If not, process operations that depend on the currently * locked pair of vnodes to try to flush space as the syncer, buf daemon, * and softdep flush threads can not acquire these locks to reclaim space. */ static void softdep_prelink(dvp, vp) struct vnode *dvp; struct vnode *vp; { struct ufsmount *ump; ump = VFSTOUFS(dvp->v_mount); LOCK_OWNED(ump); /* * Nothing to do if we have sufficient journal space. * If we currently hold the snapshot lock, we must avoid * handling other resources that could cause deadlock. */ if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) return; stat_journal_low++; FREE_LOCK(ump); if (vp) ffs_syncvnode(vp, MNT_NOWAIT, 0); ffs_syncvnode(dvp, MNT_WAIT, 0); ACQUIRE_LOCK(ump); /* Process vp before dvp as it may create .. removes. 
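 * (Editor's note: e.g. removing a directory via vp queues work that
 * drops the parent's link count for the lost ".." entry, which the
 * process_removes(dvp) pass below can then complete.)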
*/ if (vp) { process_removes(vp); process_truncates(vp); } process_removes(dvp); process_truncates(dvp); softdep_speedup(ump); process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } } static void jseg_write(ump, jseg, data) struct ufsmount *ump; struct jseg *jseg; uint8_t *data; { struct jsegrec *rec; rec = (struct jsegrec *)data; rec->jsr_seq = jseg->js_seq; rec->jsr_oldest = jseg->js_oldseq; rec->jsr_cnt = jseg->js_cnt; rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; rec->jsr_crc = 0; rec->jsr_time = ump->um_fs->fs_mtime; } static inline void inoref_write(inoref, jseg, rec) struct inoref *inoref; struct jseg *jseg; struct jrefrec *rec; { inoref->if_jsegdep->jd_seg = jseg; rec->jr_ino = inoref->if_ino; rec->jr_parent = inoref->if_parent; rec->jr_nlink = inoref->if_nlink; rec->jr_mode = inoref->if_mode; rec->jr_diroff = inoref->if_diroff; } static void jaddref_write(jaddref, jseg, data) struct jaddref *jaddref; struct jseg *jseg; uint8_t *data; { struct jrefrec *rec; rec = (struct jrefrec *)data; rec->jr_op = JOP_ADDREF; inoref_write(&jaddref->ja_ref, jseg, rec); } static void jremref_write(jremref, jseg, data) struct jremref *jremref; struct jseg *jseg; uint8_t *data; { struct jrefrec *rec; rec = (struct jrefrec *)data; rec->jr_op = JOP_REMREF; inoref_write(&jremref->jr_ref, jseg, rec); } static void jmvref_write(jmvref, jseg, data) struct jmvref *jmvref; struct jseg *jseg; uint8_t *data; { struct jmvrec *rec; rec = (struct jmvrec *)data; rec->jm_op = JOP_MVREF; rec->jm_ino = jmvref->jm_ino; rec->jm_parent = jmvref->jm_parent; rec->jm_oldoff = jmvref->jm_oldoff; rec->jm_newoff = jmvref->jm_newoff; } static void jnewblk_write(jnewblk, jseg, data) struct jnewblk *jnewblk; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jnewblk->jn_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_NEWBLK; rec->jb_ino = jnewblk->jn_ino; rec->jb_blkno = jnewblk->jn_blkno; rec->jb_lbn = jnewblk->jn_lbn; rec->jb_frags = jnewblk->jn_frags; rec->jb_oldfrags = jnewblk->jn_oldfrags; } static void jfreeblk_write(jfreeblk, jseg, data) struct jfreeblk *jfreeblk; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_FREEBLK; rec->jb_ino = jfreeblk->jf_ino; rec->jb_blkno = jfreeblk->jf_blkno; rec->jb_lbn = jfreeblk->jf_lbn; rec->jb_frags = jfreeblk->jf_frags; rec->jb_oldfrags = 0; } static void jfreefrag_write(jfreefrag, jseg, data) struct jfreefrag *jfreefrag; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jfreefrag->fr_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_FREEBLK; rec->jb_ino = jfreefrag->fr_ino; rec->jb_blkno = jfreefrag->fr_blkno; rec->jb_lbn = jfreefrag->fr_lbn; rec->jb_frags = jfreefrag->fr_frags; rec->jb_oldfrags = 0; } static void jtrunc_write(jtrunc, jseg, data) struct jtrunc *jtrunc; struct jseg *jseg; uint8_t *data; { struct jtrncrec *rec; jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; rec = (struct jtrncrec *)data; rec->jt_op = JOP_TRUNC; rec->jt_ino = jtrunc->jt_ino; rec->jt_size = jtrunc->jt_size; rec->jt_extsize = jtrunc->jt_extsize; } static void jfsync_write(jfsync, jseg, data) struct jfsync *jfsync; struct jseg *jseg; uint8_t *data; { struct jtrncrec *rec; rec = (struct jtrncrec *)data; rec->jt_op = JOP_SYNC; rec->jt_ino = jfsync->jfs_ino; rec->jt_size = jfsync->jfs_size; rec->jt_extsize = jfsync->jfs_extsize; } 
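/*
 * Editor's illustration (guarded out, never compiled): how a consumer of
 * the journal laid out by softdep_process_journal() below might walk a
 * single device block. A jsegrec header leads each device block and the
 * remaining JREC_SIZE-sized slots hold the typed records produced by the
 * j*_write() routines above; walk_journal_block() and its simplifying
 * assumptions (single block, jsr_cnt ignored) are hypothetical.
 */
#if 0
static void
walk_journal_block(uint8_t *data, int devbsize)
{
	struct jsegrec *seg;
	struct jrefrec *rec;
	int off;

	seg = (struct jsegrec *)data;	/* header; see jseg_write() */
	for (off = JREC_SIZE; off < devbsize; off += JREC_SIZE) {
		rec = (struct jrefrec *)(data + off);
		switch (rec->jr_op) {
		case JOP_ADDREF:	/* written by jaddref_write() */
		case JOP_REMREF:	/* written by jremref_write() */
			/* inode link count add/remove record */
			break;
		default:
			/* JOP_NEWBLK, JOP_FREEBLK, JOP_TRUNC, ... */
			break;
		}
	}
}
#endif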
static void softdep_flushjournal(mp) struct mount *mp; { struct jblocks *jblocks; struct ufsmount *ump; if (MOUNTEDSUJ(mp) == 0) return; ump = VFSTOUFS(mp); jblocks = ump->softdep_jblocks; ACQUIRE_LOCK(ump); while (ump->softdep_on_journal) { jblocks->jb_needseg = 1; softdep_process_journal(mp, NULL, MNT_WAIT); } FREE_LOCK(ump); } static void softdep_synchronize_completed(struct bio *); static void softdep_synchronize(struct bio *, struct ufsmount *, void *); static void softdep_synchronize_completed(bp) struct bio *bp; { struct jseg *oldest; struct jseg *jseg; struct ufsmount *ump; /* * caller1 marks the last segment written before we issued the * synchronize cache. */ jseg = bp->bio_caller1; if (jseg == NULL) { g_destroy_bio(bp); return; } ump = VFSTOUFS(jseg->js_list.wk_mp); ACQUIRE_LOCK(ump); oldest = NULL; /* * Mark all the journal entries waiting on the synchronize cache * as completed so they may continue on. */ while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) { jseg->js_state |= COMPLETE; oldest = jseg; jseg = TAILQ_PREV(jseg, jseglst, js_next); } /* * Restart deferred journal entry processing from the oldest * completed jseg. */ if (oldest) complete_jsegs(oldest); FREE_LOCK(ump); g_destroy_bio(bp); } /* * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering * barriers. The journal must be written prior to any blocks that depend * on it and the journal can not be released until the blocks have been * written. This code handles both barriers simultaneously. */ static void softdep_synchronize(bp, ump, caller1) struct bio *bp; struct ufsmount *ump; void *caller1; { bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = ump->um_cp->provider->mediasize; bp->bio_length = 0; bp->bio_done = softdep_synchronize_completed; bp->bio_caller1 = caller1; g_io_request(bp, (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private); } /* * Flush some journal records to disk. */ static void softdep_process_journal(mp, needwk, flags) struct mount *mp; struct worklist *needwk; int flags; { struct jblocks *jblocks; struct ufsmount *ump; struct worklist *wk; struct jseg *jseg; struct buf *bp; struct bio *bio; uint8_t *data; struct fs *fs; int shouldflush; int segwritten; int jrecmin; /* Minimum records per block. */ int jrecmax; /* Maximum records per block. */ int size; int cnt; int off; int devbsize; if (MOUNTEDSUJ(mp) == 0) return; shouldflush = softdep_flushcache; bio = NULL; jseg = NULL; ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; jblocks = ump->softdep_jblocks; devbsize = ump->um_devvp->v_bufobj.bo_bsize; /* * We write anywhere between a disk block and fs block. The upper * bound is picked to prevent buffer cache fragmentation and limit * processing time per I/O. */ jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ jrecmax = (fs->fs_bsize / devbsize) * jrecmin; segwritten = 0; for (;;) { cnt = ump->softdep_on_journal; /* * Criteria for writing a segment: * 1) We have a full block. * 2) We're called from jwait() and haven't found the * journal item yet. * 3) Always write if needseg is set. * 4) If we are called from process_worklist and have * not yet written anything we write a partial block * to enforce a 1 second maximum latency on journal * entries. */ if (cnt < (jrecmax - 1) && needwk == NULL && jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) break; cnt++; /* * Verify some free journal space.
softdep_prealloc() should * guarantee that we don't run out so this is indicative of * a problem with the flow control. Try to recover * gracefully in any event. */ while (jblocks->jb_free == 0) { if (flags != MNT_WAIT) break; printf("softdep: Out of journal space!\n"); softdep_speedup(ump); msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz); } FREE_LOCK(ump); jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); workitem_alloc(&jseg->js_list, D_JSEG, mp); LIST_INIT(&jseg->js_entries); LIST_INIT(&jseg->js_indirs); jseg->js_state = ATTACHED; if (shouldflush == 0) jseg->js_state |= COMPLETE; else if (bio == NULL) bio = g_alloc_bio(); jseg->js_jblocks = jblocks; bp = geteblk(fs->fs_bsize, 0); ACQUIRE_LOCK(ump); /* * If there was a race while we were allocating the block * and jseg the entry we care about was likely written. * We bail out in both the WAIT and NOWAIT case and assume * the caller will loop if the entry it cares about is * not written. */ cnt = ump->softdep_on_journal; if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { bp->b_flags |= B_INVAL | B_NOCACHE; WORKITEM_FREE(jseg, D_JSEG); FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); break; } /* * Calculate the disk block size required for the available * records rounded to the min size. */ if (cnt == 0) size = devbsize; else if (cnt < jrecmax) size = howmany(cnt, jrecmin) * devbsize; else size = fs->fs_bsize; /* * Allocate a disk block for this journal data and account * for truncation of the requested size if enough contiguous * space was not available. */ bp->b_blkno = jblocks_alloc(jblocks, size, &size); bp->b_lblkno = bp->b_blkno; bp->b_offset = bp->b_blkno * DEV_BSIZE; bp->b_bcount = size; bp->b_flags &= ~B_INVAL; bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; /* * Initialize our jseg with cnt records. Assign the next * sequence number to it and link it in-order. */ cnt = MIN(cnt, (size / devbsize) * jrecmin); jseg->js_buf = bp; jseg->js_cnt = cnt; jseg->js_refs = cnt + 1; /* Self ref. */ jseg->js_size = size; jseg->js_seq = jblocks->jb_nextseq++; if (jblocks->jb_oldestseg == NULL) jblocks->jb_oldestseg = jseg; jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); if (jblocks->jb_writeseg == NULL) jblocks->jb_writeseg = jseg; /* * Start filling in records from the pending list. */ data = bp->b_data; off = 0; /* * Always put a header on the first block. * XXX As with below, there might not be a chance to get * into the loop. Ensure that something valid is written. */ jseg_write(ump, jseg, data); off += JREC_SIZE; data = bp->b_data + off; /* * XXX Something is wrong here. There's no work to do, * but we need to perform an I/O and allow it to complete * anyway. */ if (LIST_EMPTY(&ump->softdep_journal_pending)) stat_emptyjblocks++; while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) != NULL) { if (cnt == 0) break; /* Place a segment header on every device block.
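 *
 * Editor's worked example (illustrative numbers): with devbsize 512
 * and a 32-byte JREC_SIZE, jrecmin above is 512 / 32 - 1 = 15 payload
 * records per device block, the remaining slot being the jsegrec
 * header rewritten here each time off crosses a devbsize boundary.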
*/ if ((off % devbsize) == 0) { jseg_write(ump, jseg, data); off += JREC_SIZE; data = bp->b_data + off; } if (wk == needwk) needwk = NULL; remove_from_journal(wk); wk->wk_state |= INPROGRESS; WORKLIST_INSERT(&jseg->js_entries, wk); switch (wk->wk_type) { case D_JADDREF: jaddref_write(WK_JADDREF(wk), jseg, data); break; case D_JREMREF: jremref_write(WK_JREMREF(wk), jseg, data); break; case D_JMVREF: jmvref_write(WK_JMVREF(wk), jseg, data); break; case D_JNEWBLK: jnewblk_write(WK_JNEWBLK(wk), jseg, data); break; case D_JFREEBLK: jfreeblk_write(WK_JFREEBLK(wk), jseg, data); break; case D_JFREEFRAG: jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); break; case D_JTRUNC: jtrunc_write(WK_JTRUNC(wk), jseg, data); break; case D_JFSYNC: jfsync_write(WK_JFSYNC(wk), jseg, data); break; default: panic("process_journal: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } off += JREC_SIZE; data = bp->b_data + off; cnt--; } /* Clear any remaining space so we don't leak kernel data */ if (size > off) bzero(data, size - off); /* * Write this one buffer and continue. */ segwritten = 1; jblocks->jb_needseg = 0; WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); FREE_LOCK(ump); pbgetvp(ump->um_devvp, bp); /* * We only do the blocking wait once we find the journal * entry we're looking for. */ if (needwk == NULL && flags == MNT_WAIT) bwrite(bp); else bawrite(bp); ACQUIRE_LOCK(ump); } /* * If we wrote a segment issue a synchronize cache so the journal * is reflected on disk before the data is written. Since reclaiming * journal space also requires writing a journal record this * process also enforces a barrier before reclamation. */ if (segwritten && shouldflush) { softdep_synchronize(bio, ump, TAILQ_LAST(&jblocks->jb_segs, jseglst)); } else if (bio) g_destroy_bio(bio); /* * If we've suspended the filesystem because we ran out of journal * space either try to sync it here to make some progress or * unsuspend it if we already have. */ if (flags == 0 && jblocks->jb_suspended) { if (journal_unsuspend(ump)) return; FREE_LOCK(ump); VFS_SYNC(mp, MNT_NOWAIT); ffs_sbupdate(ump, MNT_WAIT, 0); ACQUIRE_LOCK(ump); } } /* * Complete a jseg, allowing all dependencies awaiting journal writes * to proceed. Each journal dependency also attaches a jsegdep to dependent * structures so that the journal segment can be freed to reclaim space. */ static void complete_jseg(jseg) struct jseg *jseg; { struct worklist *wk; struct jmvref *jmvref; int waiting; #ifdef INVARIANTS int i = 0; #endif while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { WORKLIST_REMOVE(wk); waiting = wk->wk_state & IOWAITING; wk->wk_state &= ~(INPROGRESS | IOWAITING); wk->wk_state |= COMPLETE; KASSERT(i++ < jseg->js_cnt, ("handle_written_jseg: overflow %d >= %d", i - 1, jseg->js_cnt)); switch (wk->wk_type) { case D_JADDREF: handle_written_jaddref(WK_JADDREF(wk)); break; case D_JREMREF: handle_written_jremref(WK_JREMREF(wk)); break; case D_JMVREF: rele_jseg(jseg); /* No jsegdep. */ jmvref = WK_JMVREF(wk); LIST_REMOVE(jmvref, jm_deps); if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) free_pagedep(jmvref->jm_pagedep); WORKITEM_FREE(jmvref, D_JMVREF); break; case D_JNEWBLK: handle_written_jnewblk(WK_JNEWBLK(wk)); break; case D_JFREEBLK: handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); break; case D_JTRUNC: handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); break; case D_JFSYNC: rele_jseg(jseg); /* No jsegdep. 
*/ WORKITEM_FREE(wk, D_JFSYNC); break; case D_JFREEFRAG: handle_written_jfreefrag(WK_JFREEFRAG(wk)); break; default: panic("handle_written_jseg: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } if (waiting) wakeup(wk); } /* Release the self reference so the structure may be freed. */ rele_jseg(jseg); } /* * Determine which jsegs are ready for completion processing. Waits for * synchronize cache to complete as well as forcing in-order completion * of journal entries. */ static void complete_jsegs(jseg) struct jseg *jseg; { struct jblocks *jblocks; struct jseg *jsegn; jblocks = jseg->js_jblocks; /* * Don't allow out of order completions. If this isn't the first * block wait for it to write before we're done. */ if (jseg != jblocks->jb_writeseg) return; /* Iterate through available jsegs processing their entries. */ while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) { jblocks->jb_oldestwrseq = jseg->js_oldseq; jsegn = TAILQ_NEXT(jseg, js_next); complete_jseg(jseg); jseg = jsegn; } jblocks->jb_writeseg = jseg; /* * Attempt to free jsegs now that oldestwrseq may have advanced. */ free_jsegs(jblocks); } /* * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle * the final completions. */ static void handle_written_jseg(jseg, bp) struct jseg *jseg; struct buf *bp; { if (jseg->js_refs == 0) panic("handle_written_jseg: No self-reference on %p", jseg); jseg->js_state |= DEPCOMPLETE; /* * We'll never need this buffer again, set flags so it will be * discarded. */ bp->b_flags |= B_INVAL | B_NOCACHE; pbrelvp(bp); complete_jsegs(jseg); } static inline struct jsegdep * inoref_jseg(inoref) struct inoref *inoref; { struct jsegdep *jsegdep; jsegdep = inoref->if_jsegdep; inoref->if_jsegdep = NULL; return (jsegdep); } /* * Called once a jremref has made it to stable store. The jremref is marked * complete and we attempt to free it. Any pagedeps writes sleeping waiting * for the jremref to complete will be awoken by free_jremref. */ static void handle_written_jremref(jremref) struct jremref *jremref; { struct inodedep *inodedep; struct jsegdep *jsegdep; struct dirrem *dirrem; /* Grab the jsegdep. */ jsegdep = inoref_jseg(&jremref->jr_ref); /* * Remove us from the inoref list. */ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("handle_written_jremref: Lost inodedep"); TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); /* * Complete the dirrem. */ dirrem = jremref->jr_dirrem; jremref->jr_dirrem = NULL; LIST_REMOVE(jremref, jr_deps); jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; jwork_insert(&dirrem->dm_jwork, jsegdep); if (LIST_EMPTY(&dirrem->dm_jremrefhd) && (dirrem->dm_state & COMPLETE) != 0) add_to_worklist(&dirrem->dm_list, 0); free_jremref(jremref); } /* * Called once a jaddref has made it to stable store. The dependency is * marked complete and any dependent structures are added to the inode * bufwait list to be completed as soon as it is written. If a bitmap write * depends on this entry we move the inode into the inodedephd of the * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. */ static void handle_written_jaddref(jaddref) struct jaddref *jaddref; { struct jsegdep *jsegdep; struct inodedep *inodedep; struct diradd *diradd; struct mkdir *mkdir; /* Grab the jsegdep. 
*/ jsegdep = inoref_jseg(&jaddref->ja_ref); mkdir = NULL; diradd = NULL; if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("handle_written_jaddref: Lost inodedep."); if (jaddref->ja_diradd == NULL) panic("handle_written_jaddref: No dependency"); if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { diradd = jaddref->ja_diradd; WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); } else if (jaddref->ja_state & MKDIR_PARENT) { mkdir = jaddref->ja_mkdir; WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); } else if (jaddref->ja_state & MKDIR_BODY) mkdir = jaddref->ja_mkdir; else panic("handle_written_jaddref: Unknown dependency %p", jaddref->ja_diradd); jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ /* * Remove us from the inode list. */ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); /* * The mkdir may be waiting on the jaddref to clear before freeing. */ if (mkdir) { KASSERT(mkdir->md_list.wk_type == D_MKDIR, ("handle_written_jaddref: Incorrect type for mkdir %s", TYPENAME(mkdir->md_list.wk_type))); mkdir->md_jaddref = NULL; diradd = mkdir->md_diradd; mkdir->md_state |= DEPCOMPLETE; complete_mkdir(mkdir); } jwork_insert(&diradd->da_jwork, jsegdep); if (jaddref->ja_state & NEWBLOCK) { inodedep->id_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, inodedep, id_deps); } free_jaddref(jaddref); } /* * Called once a jnewblk journal is written. The allocdirect or allocindir * is placed in the bmsafemap to await notification of a written bitmap. If * the operation was canceled we add the segdep to the appropriate * dependency to free the journal space once the canceling operation * completes. */ static void handle_written_jnewblk(jnewblk) struct jnewblk *jnewblk; { struct bmsafemap *bmsafemap; struct freefrag *freefrag; struct freework *freework; struct jsegdep *jsegdep; struct newblk *newblk; /* Grab the jsegdep. */ jsegdep = jnewblk->jn_jsegdep; jnewblk->jn_jsegdep = NULL; if (jnewblk->jn_dep == NULL) panic("handle_written_jnewblk: No dependency for the segdep."); switch (jnewblk->jn_dep->wk_type) { case D_NEWBLK: case D_ALLOCDIRECT: case D_ALLOCINDIR: /* * Add the written block to the bmsafemap so it can * be notified when the bitmap is on disk. */ newblk = WK_NEWBLK(jnewblk->jn_dep); newblk->nb_jnewblk = NULL; if ((newblk->nb_state & GOINGAWAY) == 0) { bmsafemap = newblk->nb_bmsafemap; newblk->nb_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); } jwork_insert(&newblk->nb_jwork, jsegdep); break; case D_FREEFRAG: /* * A newblock being removed by a freefrag when replaced by * frag extension. */ freefrag = WK_FREEFRAG(jnewblk->jn_dep); freefrag->ff_jdep = NULL; jwork_insert(&freefrag->ff_jwork, jsegdep); break; case D_FREEWORK: /* * A direct block was removed by truncate. */ freework = WK_FREEWORK(jnewblk->jn_dep); freework->fw_jnewblk = NULL; jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep); break; default: panic("handle_written_jnewblk: Unknown type %d.", jnewblk->jn_dep->wk_type); } jnewblk->jn_dep = NULL; free_jnewblk(jnewblk); } /* * Cancel a jfreefrag that won't be needed, probably due to colliding with * an in-flight allocation that has not yet been committed. Divorce us * from the freefrag and mark it DEPCOMPLETE so that it may be added * to the worklist. 
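 *
 * Editor's example: a fragment freed by an extending write and then
 * handed right back to the same file before its JOP_FREEBLK record is
 * written no longer needs journaling, so the jfreefrag is divorced and
 * the freefrag may proceed without waiting on the journal.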
*/ static void cancel_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { struct freefrag *freefrag; if (jfreefrag->fr_jsegdep) { free_jsegdep(jfreefrag->fr_jsegdep); jfreefrag->fr_jsegdep = NULL; } freefrag = jfreefrag->fr_freefrag; jfreefrag->fr_freefrag = NULL; free_jfreefrag(jfreefrag); freefrag->ff_state |= DEPCOMPLETE; CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno); } /* * Free a jfreefrag when the parent freefrag is rendered obsolete. */ static void free_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { if (jfreefrag->fr_state & INPROGRESS) WORKLIST_REMOVE(&jfreefrag->fr_list); else if (jfreefrag->fr_state & ONWORKLIST) remove_from_journal(&jfreefrag->fr_list); if (jfreefrag->fr_freefrag != NULL) panic("free_jfreefrag: Still attached to a freefrag."); WORKITEM_FREE(jfreefrag, D_JFREEFRAG); } /* * Called when the journal write for a jfreefrag completes. The parent * freefrag is added to the worklist if this completes its dependencies. */ static void handle_written_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { struct jsegdep *jsegdep; struct freefrag *freefrag; /* Grab the jsegdep. */ jsegdep = jfreefrag->fr_jsegdep; jfreefrag->fr_jsegdep = NULL; freefrag = jfreefrag->fr_freefrag; if (freefrag == NULL) panic("handle_written_jfreefrag: No freefrag."); freefrag->ff_state |= DEPCOMPLETE; freefrag->ff_jdep = NULL; jwork_insert(&freefrag->ff_jwork, jsegdep); if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freefrag->ff_list, 0); jfreefrag->fr_freefrag = NULL; free_jfreefrag(jfreefrag); } /* * Called when the journal write for a jfreeblk completes. The jfreeblk * is removed from the freeblks list of pending journal writes and the * jsegdep is moved to the freeblks jwork to be completed when all blocks * have been reclaimed. */ static void handle_written_jblkdep(jblkdep) struct jblkdep *jblkdep; { struct freeblks *freeblks; struct jsegdep *jsegdep; /* Grab the jsegdep. */ jsegdep = jblkdep->jb_jsegdep; jblkdep->jb_jsegdep = NULL; freeblks = jblkdep->jb_freeblks; LIST_REMOVE(jblkdep, jb_deps); jwork_insert(&freeblks->fb_jwork, jsegdep); /* * If the freeblks is all journaled, we can add it to the worklist. */ if (LIST_EMPTY(&freeblks->fb_jblkdephd) && (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freeblks->fb_list, WK_NODELAY); free_jblkdep(jblkdep); } static struct jsegdep * newjsegdep(struct worklist *wk) { struct jsegdep *jsegdep; jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); jsegdep->jd_seg = NULL; return (jsegdep); } static struct jmvref * newjmvref(dp, ino, oldoff, newoff) struct inode *dp; ino_t ino; off_t oldoff; off_t newoff; { struct jmvref *jmvref; jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp)); jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; jmvref->jm_parent = dp->i_number; jmvref->jm_ino = ino; jmvref->jm_oldoff = oldoff; jmvref->jm_newoff = newoff; return (jmvref); } /* * Allocate a new jremref that tracks the removal of ip from dp with the * directory entry offset of diroff. Mark the entry as ATTACHED and * DEPCOMPLETE as we have all the information required for the journal write * and the directory has already been removed from the buffer. The caller * is responsible for linking the jremref into the pagedep and adding it * to the journal to write. 
The MKDIR_PARENT flag is set if we're doing * a DOTDOT addition so handle_workitem_remove() can properly assign * the jsegdep when we're done. */ static struct jremref * newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, off_t diroff, nlink_t nlink) { struct jremref *jremref; jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp)); jremref->jr_state = ATTACHED; newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, nlink, ip->i_mode); jremref->jr_dirrem = dirrem; return (jremref); } static inline void newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, nlink_t nlink, uint16_t mode) { inoref->if_jsegdep = newjsegdep(&inoref->if_list); inoref->if_diroff = diroff; inoref->if_ino = ino; inoref->if_parent = parent; inoref->if_nlink = nlink; inoref->if_mode = mode; } /* * Allocate a new jaddref to track the addition of ino to dp at diroff. The * directory offset may not be known until later. The caller is responsible * for adding the entry to the journal when this information is available. nlink * should be the link count prior to the addition and mode is only required * to have the correct FMT. */ static struct jaddref * newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, uint16_t mode) { struct jaddref *jaddref; jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp)); jaddref->ja_state = ATTACHED; jaddref->ja_mkdir = NULL; newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); return (jaddref); } /* * Create a new free dependency for a freework. The caller is responsible * for adjusting the reference count when it has the lock held. The freedep * will track an outstanding bitmap write that will ultimately clear the * freework to continue. */ static struct freedep * newfreedep(struct freework *freework) { struct freedep *freedep; freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); freedep->fd_freework = freework; return (freedep); } /* * Free a freedep structure once the buffer it is linked to is written. If * this is the last reference to the freework schedule it for completion. */ static void free_freedep(freedep) struct freedep *freedep; { struct freework *freework; freework = freedep->fd_freework; freework->fw_freeblks->fb_cgwait--; if (--freework->fw_ref == 0) freework_enqueue(freework); WORKITEM_FREE(freedep, D_FREEDEP); } /* * Allocate a new freework structure that may be a level in an indirect * when parent is not NULL or a top level block when it is. The top level * freework structures are allocated without the per-filesystem lock held * and before the freeblks is visible outside of softdep_setup_freeblocks(). */ static struct freework * newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) struct ufsmount *ump; struct freeblks *freeblks; struct freework *parent; ufs_lbn_t lbn; ufs2_daddr_t nb; int frags; int off; int journal; { struct freework *freework; freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); freework->fw_state = ATTACHED; freework->fw_jnewblk = NULL; freework->fw_freeblks = freeblks; freework->fw_parent = parent; freework->fw_lbn = lbn; freework->fw_blkno = nb; freework->fw_frags = frags; freework->fw_indir = NULL; - freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR) - ?
0 : NINDIR(ump->um_fs) + 1; + freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || + lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1; freework->fw_start = freework->fw_off = off; if (journal) newjfreeblk(freeblks, lbn, nb, frags); if (parent == NULL) { ACQUIRE_LOCK(ump); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); freeblks->fb_ref++; FREE_LOCK(ump); } return (freework); } /* * Eliminate a jfreeblk for a block that does not need journaling. */ static void cancel_jfreeblk(freeblks, blkno) struct freeblks *freeblks; ufs2_daddr_t blkno; { struct jfreeblk *jfreeblk; struct jblkdep *jblkdep; LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { if (jblkdep->jb_list.wk_type != D_JFREEBLK) continue; jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); if (jfreeblk->jf_blkno == blkno) break; } if (jblkdep == NULL) return; CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno); free_jsegdep(jblkdep->jb_jsegdep); LIST_REMOVE(jblkdep, jb_deps); WORKITEM_FREE(jfreeblk, D_JFREEBLK); } /* * Allocate a new jfreeblk to journal top level block pointer when truncating * a file. The caller must add this to the worklist when the per-filesystem * lock is held. */ static struct jfreeblk * newjfreeblk(freeblks, lbn, blkno, frags) struct freeblks *freeblks; ufs_lbn_t lbn; ufs2_daddr_t blkno; int frags; { struct jfreeblk *jfreeblk; jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, freeblks->fb_list.wk_mp); jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); jfreeblk->jf_dep.jb_freeblks = freeblks; jfreeblk->jf_ino = freeblks->fb_inum; jfreeblk->jf_lbn = lbn; jfreeblk->jf_blkno = blkno; jfreeblk->jf_frags = frags; LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); return (jfreeblk); } /* * The journal is only prepared to handle full-size block numbers, so we * have to adjust the record to reflect the change to a full-size block. * For example, suppose we have a block made up of fragments 8-15 and * want to free its last two fragments. We are given a request that says: * FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0 * where frags are the number of fragments to free and oldfrags are the * number of fragments to keep. To block align it, we have to change it to * have a valid full-size blkno, so it becomes: * FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6 */ static void adjust_newfreework(freeblks, frag_offset) struct freeblks *freeblks; int frag_offset; { struct jfreeblk *jfreeblk; KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL && LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK), ("adjust_newfreework: Missing freeblks dependency")); jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd)); jfreeblk->jf_blkno -= frag_offset; jfreeblk->jf_frags += frag_offset; } /* * Allocate a new jtrunc to track a partial truncation. 
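 *
 * Editor's example (hypothetical values): truncate(2) of a file down to
 * 4096 bytes would be logged via jtrunc_write() above roughly as
 *
 *	JOP_TRUNC ino=<inum>, size=4096, extsize=0
 *
 * letting recovery finish releasing the blocks past the new size if the
 * system crashes mid-truncation.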
*/ static struct jtrunc * newjtrunc(freeblks, size, extsize) struct freeblks *freeblks; off_t size; int extsize; { struct jtrunc *jtrunc; jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, freeblks->fb_list.wk_mp); jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); jtrunc->jt_dep.jb_freeblks = freeblks; jtrunc->jt_ino = freeblks->fb_inum; jtrunc->jt_size = size; jtrunc->jt_extsize = extsize; LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); return (jtrunc); } /* * If we're canceling a new bitmap we have to search for another ref * to move into the bmsafemap dep. This might be better expressed * with another structure. */ static void move_newblock_dep(jaddref, inodedep) struct jaddref *jaddref; struct inodedep *inodedep; { struct inoref *inoref; struct jaddref *jaddrefn; jaddrefn = NULL; for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; inoref = TAILQ_NEXT(inoref, if_deps)) { if ((jaddref->ja_state & NEWBLOCK) && inoref->if_list.wk_type == D_JADDREF) { jaddrefn = (struct jaddref *)inoref; break; } } if (jaddrefn == NULL) return; jaddrefn->ja_state &= ~(ATTACHED | UNDONE); jaddrefn->ja_state |= jaddref->ja_state & (ATTACHED | UNDONE | NEWBLOCK); jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); jaddref->ja_state |= ATTACHED; LIST_REMOVE(jaddref, ja_bmdeps); LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, ja_bmdeps); } /* * Cancel a jaddref either before it has been written or while it is being * written. This happens when a link is removed before the add reaches * the disk. The jaddref dependency is kept linked into the bmsafemap * and inode to prevent the link count or bitmap from reaching the disk * until handle_workitem_remove() re-adjusts the counts and bitmaps as * required. * * Returns 1 if the canceled addref requires journaling of the remove and * 0 otherwise. */ static int cancel_jaddref(jaddref, inodedep, wkhd) struct jaddref *jaddref; struct inodedep *inodedep; struct workhead *wkhd; { struct inoref *inoref; struct jsegdep *jsegdep; int needsj; KASSERT((jaddref->ja_state & COMPLETE) == 0, ("cancel_jaddref: Canceling complete jaddref")); if (jaddref->ja_state & (INPROGRESS | COMPLETE)) needsj = 1; else needsj = 0; if (inodedep == NULL) if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_jaddref: Lost inodedep"); /* * We must adjust the nlink of any reference operation that follows * us so that it is consistent with the in-memory reference. This * ensures that inode nlink rollbacks always have the correct link. */ if (needsj == 0) { for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; inoref = TAILQ_NEXT(inoref, if_deps)) { if (inoref->if_state & GOINGAWAY) break; inoref->if_nlink--; } } jsegdep = inoref_jseg(&jaddref->ja_ref); if (jaddref->ja_state & NEWBLOCK) move_newblock_dep(jaddref, inodedep); wake_worklist(&jaddref->ja_list); jaddref->ja_mkdir = NULL; if (jaddref->ja_state & INPROGRESS) { jaddref->ja_state &= ~INPROGRESS; WORKLIST_REMOVE(&jaddref->ja_list); jwork_insert(wkhd, jsegdep); } else { free_jsegdep(jsegdep); if (jaddref->ja_state & DEPCOMPLETE) remove_from_journal(&jaddref->ja_list); } jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); /* * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove * can arrange for them to be freed with the bitmap. Otherwise we * no longer need this addref attached to the inoreflst and it * will incorrectly adjust nlink if we leave it. 
*/ if ((jaddref->ja_state & NEWBLOCK) == 0) { TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); jaddref->ja_state |= COMPLETE; free_jaddref(jaddref); return (needsj); } /* * Leave the head of the list for jsegdeps for fast merging. */ if (LIST_FIRST(wkhd) != NULL) { jaddref->ja_state |= ONWORKLIST; LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); } else WORKLIST_INSERT(wkhd, &jaddref->ja_list); return (needsj); } /* * Attempt to free a jaddref structure when some work completes. This * should only succeed once the entry is written and all dependencies have * been notified. */ static void free_jaddref(jaddref) struct jaddref *jaddref; { if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (jaddref->ja_ref.if_jsegdep) panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", jaddref, jaddref->ja_state); if (jaddref->ja_state & NEWBLOCK) LIST_REMOVE(jaddref, ja_bmdeps); if (jaddref->ja_state & (INPROGRESS | ONWORKLIST)) panic("free_jaddref: Bad state %p(0x%X)", jaddref, jaddref->ja_state); if (jaddref->ja_mkdir != NULL) panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); WORKITEM_FREE(jaddref, D_JADDREF); } /* * Free a jremref structure once it has been written or discarded. */ static void free_jremref(jremref) struct jremref *jremref; { if (jremref->jr_ref.if_jsegdep) free_jsegdep(jremref->jr_ref.if_jsegdep); if (jremref->jr_state & INPROGRESS) panic("free_jremref: IO still pending"); WORKITEM_FREE(jremref, D_JREMREF); } /* * Free a jnewblk structure. */ static void free_jnewblk(jnewblk) struct jnewblk *jnewblk; { if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) return; LIST_REMOVE(jnewblk, jn_deps); if (jnewblk->jn_dep != NULL) panic("free_jnewblk: Dependency still attached."); WORKITEM_FREE(jnewblk, D_JNEWBLK); } /* * Cancel a jnewblk which has been made redundant by frag extension. */ static void cancel_jnewblk(jnewblk, wkhd) struct jnewblk *jnewblk; struct workhead *wkhd; { struct jsegdep *jsegdep; CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno); jsegdep = jnewblk->jn_jsegdep; if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) panic("cancel_jnewblk: Invalid state"); jnewblk->jn_jsegdep = NULL; jnewblk->jn_dep = NULL; jnewblk->jn_state |= GOINGAWAY; if (jnewblk->jn_state & INPROGRESS) { jnewblk->jn_state &= ~INPROGRESS; WORKLIST_REMOVE(&jnewblk->jn_list); jwork_insert(wkhd, jsegdep); } else { free_jsegdep(jsegdep); remove_from_journal(&jnewblk->jn_list); } wake_worklist(&jnewblk->jn_list); WORKLIST_INSERT(wkhd, &jnewblk->jn_list); } static void free_jblkdep(jblkdep) struct jblkdep *jblkdep; { if (jblkdep->jb_list.wk_type == D_JFREEBLK) WORKITEM_FREE(jblkdep, D_JFREEBLK); else if (jblkdep->jb_list.wk_type == D_JTRUNC) WORKITEM_FREE(jblkdep, D_JTRUNC); else panic("free_jblkdep: Unexpected type %s", TYPENAME(jblkdep->jb_list.wk_type)); } /* * Free a single jseg once it is no longer referenced in memory or on * disk. Reclaim journal blocks and dependencies waiting for the segment * to disappear. */ static void free_jseg(jseg, jblocks) struct jseg *jseg; struct jblocks *jblocks; { struct freework *freework; /* * Free freework structures that were lingering to indicate freed * indirect blocks that forced journal write ordering on reallocate.
*/ while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) indirblk_remove(freework); if (jblocks->jb_oldestseg == jseg) jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); KASSERT(LIST_EMPTY(&jseg->js_entries), ("free_jseg: Freed jseg has valid entries.")); WORKITEM_FREE(jseg, D_JSEG); } /* * Free all jsegs that meet the criteria for being reclaimed and update * oldestseg. */ static void free_jsegs(jblocks) struct jblocks *jblocks; { struct jseg *jseg; /* * Free only those jsegs which have none allocated before them to * preserve the journal space ordering. */ while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { /* * Only reclaim space when nothing depends on this journal * set and another set has written that it is no longer * valid. */ if (jseg->js_refs != 0) { jblocks->jb_oldestseg = jseg; return; } if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE) break; if (jseg->js_seq > jblocks->jb_oldestwrseq) break; /* * We can free jsegs that didn't write entries when * oldestwrseq == js_seq. */ if (jseg->js_seq == jblocks->jb_oldestwrseq && jseg->js_cnt != 0) break; free_jseg(jseg, jblocks); } /* * If we exited the loop above we still must discover the * oldest valid segment. */ if (jseg) for (jseg = jblocks->jb_oldestseg; jseg != NULL; jseg = TAILQ_NEXT(jseg, js_next)) if (jseg->js_refs != 0) break; jblocks->jb_oldestseg = jseg; /* * The journal has no valid records but some jsegs may still be * waiting on oldestwrseq to advance. We force a small record * out to permit these lingering records to be reclaimed. */ if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) jblocks->jb_needseg = 1; } /* * Release one reference to a jseg and free it if the count reaches 0. This * should eventually reclaim journal space as well. */ static void rele_jseg(jseg) struct jseg *jseg; { KASSERT(jseg->js_refs > 0, ("free_jseg: Invalid refcnt %d", jseg->js_refs)); if (--jseg->js_refs != 0) return; free_jsegs(jseg->js_jblocks); } /* * Release a jsegdep and decrement the jseg count. */ static void free_jsegdep(jsegdep) struct jsegdep *jsegdep; { if (jsegdep->jd_seg) rele_jseg(jsegdep->jd_seg); WORKITEM_FREE(jsegdep, D_JSEGDEP); } /* * Wait for a journal item to make it to disk. Initiate journal processing * if required. */ static int jwait(wk, waitfor) struct worklist *wk; int waitfor; { LOCK_OWNED(VFSTOUFS(wk->wk_mp)); /* * Blocking journal waits cause slow synchronous behavior. Record * stats on the frequency of these blocking operations. */ if (waitfor == MNT_WAIT) { stat_journal_wait++; switch (wk->wk_type) { case D_JREMREF: case D_JMVREF: stat_jwait_filepage++; break; case D_JTRUNC: case D_JFREEBLK: stat_jwait_freeblks++; break; case D_JNEWBLK: stat_jwait_newblk++; break; case D_JADDREF: stat_jwait_inode++; break; default: break; } } /* * If IO has not started we process the journal. We can't mark the * worklist item as IOWAITING because we drop the lock while * processing the journal and the worklist entry may be freed after * this point. The caller may call back in and re-issue the request. */ if ((wk->wk_state & INPROGRESS) == 0) { softdep_process_journal(wk->wk_mp, wk, waitfor); if (waitfor != MNT_WAIT) return (EBUSY); return (0); } if (waitfor != MNT_WAIT) return (EBUSY); wait_worklist(wk, "jwait"); return (0); } /* * Lookup an inodedep based on an inode pointer and set the nlinkdelta as * appropriate. 
appropriate. This is a convenience function to reduce duplicate code * for the setup and revert functions below. */ static struct inodedep * inodedep_lookup_ip(ip) struct inode *ip; { struct inodedep *inodedep; KASSERT(ip->i_nlink >= ip->i_effnlink, ("inodedep_lookup_ip: bad delta")); (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC, &inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); return (inodedep); } /* * Called prior to creating a new inode and linking it to a directory. The * jaddref structure must already be allocated by softdep_setup_inomapdep * and it is discovered here so we can initialize the mode and update * nlinkdelta. */ void softdep_setup_create(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_create called on non-softdep filesystem")); KASSERT(ip->i_nlink == 1, ("softdep_setup_create: Invalid link count.")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_create: No addref structure present.")); } softdep_prelink(dvp, NULL); FREE_LOCK(ITOUMP(dp)); } /* * Create a jaddref structure to track the addition of a DOTDOT link when * we are reparenting an inode as part of a rename. This jaddref will be * found by softdep_setup_directory_change. Adjusts nlinkdelta for * non-journaling softdep. */ void softdep_setup_dotdot_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_dotdot_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; /* * We don't set MKDIR_PARENT as this is not tied to a mkdir and * is used as a normal link would be. */ if (DOINGSUJ(dvp)) jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(dp); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Create a jaddref structure to track a new link to an inode. The directory * offset is not known until softdep_setup_directory_add or * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling * softdep. */ void softdep_setup_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; if (DOINGSUJ(dvp)) jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, ip->i_mode); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to create the jaddref structures to track . and .. references as * well as lookup and further initialize the incomplete jaddref created * by softdep_setup_inomapdep when the inode was allocated. Adjusts * nlinkdelta for non-journaling softdep.
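* For a mkdir(2) three jaddrefs are ultimately in play: the incomplete one * created by softdep_setup_inomapdep for the new directory entry, one for * "." (MKDIR_BODY), and one for ".." in the parent (MKDIR_PARENT), as set * up below.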
*/ void softdep_setup_mkdir(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *dotdotaddref; struct jaddref *dotaddref; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); dotaddref = dotdotaddref = NULL; if (DOINGSUJ(dvp)) { dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, ip->i_mode); dotaddref->ja_state |= MKDIR_BODY; dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); dotdotaddref->ja_state |= MKDIR_PARENT; } ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL, ("softdep_setup_mkdir: No addref structure present.")); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_setup_mkdir: bad parent %ju", (uintmax_t)jaddref->ja_parent)); TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, if_deps); } inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &dotdotaddref->ja_ref, if_deps); softdep_prelink(ITOV(dp), NULL); FREE_LOCK(ITOUMP(dp)); } /* * Called to track nlinkdelta of the inode and parent directories prior to * unlinking a directory. */ void softdep_setup_rmdir(dp, ip) struct inode *dp; struct inode *ip; { struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_rmdir called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to track nlinkdelta of the inode and parent directories prior to * unlink. */ void softdep_setup_unlink(dp, ip) struct inode *dp; struct inode *ip; { struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_unlink called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed non-directory * creation. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_create(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0, ("softdep_revert_create called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_create: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed link * addition. Adjusts nlinkdelta for non-journaling softdep. 
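* This runs, for example, when link(2) fails after softdep_setup_link() * has already queued a jaddref; the addref is canceled rather than * journaled.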
*/ void softdep_revert_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_link called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_link: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed mkdir * attempt. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_mkdir(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct jaddref *dotaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == ip->i_number, ("softdep_revert_mkdir: dotdot addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_mkdir: addref parent mismatch")); dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, inoreflst, if_deps); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); KASSERT(dotaddref->ja_parent == ip->i_number, ("softdep_revert_mkdir: dot addref parent mismatch")); cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to correct nlinkdelta after a failed rmdir. */ void softdep_revert_rmdir(dp, ip) struct inode *dp; struct inode *ip; { KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_rmdir called on non-softdep filesystem")); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); FREE_LOCK(ITOUMP(dp)); } /* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem * after a power failure, one must (conservatively) guarantee that the * on-disk copy of the bitmaps never indicates that a live inode or block is * free. So, when a block or inode is allocated, the bitmap should be * updated (on disk) before any new pointers. When a block or inode is * freed, the bitmap should not be updated until all pointers have been * reset. The latter dependency is handled by the delayed de-allocation * approach described below for block and inode de-allocation. The former * dependency is handled by calling the following procedure when a block or * inode is allocated. When an inode is allocated an "inodedep" is created * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. * Each "inodedep" is also inserted into the hash indexing structure so * that any additional link additions can be made dependent on the inode * allocation. * * The ufs filesystem maintains a number of free block counts (e.g., per * cylinder group, per cylinder and per <cylinder, rotational position> pair) * in addition to the bitmaps. These counts are used to improve efficiency * during allocation and therefore must be consistent with the bitmaps.
* There is no convenient way to guarantee post-crash consistency of these * counts with simple update ordering, for two main reasons: (1) The counts * and bitmaps for a single cylinder group block are not in the same disk * sector. If a disk write is interrupted (e.g., by power failure), one may * be written and the other not. (2) Some of the counts are located in the * superblock rather than the cylinder group block. So, we focus our soft * updates implementation on protecting the bitmaps. When mounting a * filesystem, we recompute the auxiliary counts from the bitmaps. */ /* * Called just after updating the cylinder group block to allocate an inode. */ void softdep_setup_inomapdep(bp, ip, newinum, mode) struct buf *bp; /* buffer for cylgroup block with inode map */ struct inode *ip; /* inode related to allocation */ ino_t newinum; /* new inode number being allocated */ int mode; { struct inodedep *inodedep; struct bmsafemap *bmsafemap; struct jaddref *jaddref; struct mount *mp; struct fs *fs; mp = ITOVFS(ip); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inomapdep called on non-softdep filesystem")); fs = VFSTOUFS(mp)->um_fs; jaddref = NULL; /* * Allocate the journal reference add structure so that the bitmap * can be dependent on it. */ if (MOUNTEDSUJ(mp)) { jaddref = newjaddref(ip, newinum, 0, 0, mode); jaddref->ja_state |= NEWBLOCK; } /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. * * We have to preallocate a bmsafemap entry in case it is needed * in bmsafemap_lookup since once we allocate the inodedep, we * have to finish initializing it before we can FREE_LOCK(). * By preallocating, we avoid FREE_LOCK() while doing a malloc * in bmsafemap_lookup. We cannot call bmsafemap_lookup before * creating the inodedep as it can be freed during the time * that we FREE_LOCK() while allocating the inodedep. We must * call workitem_alloc() before entering the locked section as * it also acquires the lock and we must avoid trying to do so * recursively. */ bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); ACQUIRE_LOCK(ITOUMP(ip)); if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep))) panic("softdep_setup_inomapdep: dependency %p for new " "inode already exists", inodedep); bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap); if (jaddref) { LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); } else { inodedep->id_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); } inodedep->id_bmsafemap = bmsafemap; inodedep->id_state &= ~DEPCOMPLETE; FREE_LOCK(ITOUMP(ip)); } /* * Called just after updating the cylinder group block to * allocate block or fragment. */ void softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; /* buffer for cylgroup block with block map */ struct mount *mp; /* filesystem doing allocation */ ufs2_daddr_t newblkno; /* number of newly allocated block */ int frags; /* Number of fragments. */ int oldfrags; /* Previous number of fragments for extend.
*/ { struct newblk *newblk; struct bmsafemap *bmsafemap; struct jnewblk *jnewblk; struct ufsmount *ump; struct fs *fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_blkmapdep called on non-softdep filesystem")); ump = VFSTOUFS(mp); fs = ump->um_fs; jnewblk = NULL; /* * Create a dependency for the newly allocated block. * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ if (MOUNTEDSUJ(mp)) { jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); jnewblk->jn_state = ATTACHED; jnewblk->jn_blkno = newblkno; jnewblk->jn_frags = frags; jnewblk->jn_oldfrags = oldfrags; #ifdef SUJ_DEBUG { struct cg *cgp; uint8_t *blksfree; long bno; int i; cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); bno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isset(blksfree, bno + i)) panic("softdep_setup_blkmapdep: " "free fragment %d from %d-%d " "state 0x%X dep %p", i, jnewblk->jn_oldfrags, jnewblk->jn_frags, jnewblk->jn_state, jnewblk->jn_dep); } } #endif } CTR3(KTR_SUJ, "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d", newblkno, frags, oldfrags); ACQUIRE_LOCK(ump); if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, newblkno), NULL); if (jnewblk) { jnewblk->jn_dep = (struct worklist *)newblk; LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); } else { newblk->nb_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); } newblk->nb_bmsafemap = bmsafemap; newblk->nb_jnewblk = jnewblk; FREE_LOCK(ump); } #define BMSAFEMAP_HASH(ump, cg) \ (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size]) static int bmsafemap_find(bmsafemaphd, cg, bmsafemapp) struct bmsafemap_hashhead *bmsafemaphd; int cg; struct bmsafemap **bmsafemapp; { struct bmsafemap *bmsafemap; LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) if (bmsafemap->sm_cg == cg) break; if (bmsafemap) { *bmsafemapp = bmsafemap; return (1); } *bmsafemapp = NULL; return (0); } /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when * this routine is called and this routine must be called with * the softdep lock held. To avoid giving up the lock while * allocating a new bmsafemap, a preallocated bmsafemap may be * provided. If it is provided but not needed, it is freed. 
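* softdep_setup_inomapdep() above preallocates one for exactly this * reason: it cannot drop the lock between creating the inodedep and * attaching it to the bmsafemap.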
*/ static struct bmsafemap * bmsafemap_lookup(mp, bp, cg, newbmsafemap) struct mount *mp; struct buf *bp; int cg; struct bmsafemap *newbmsafemap; { struct bmsafemap_hashhead *bmsafemaphd; struct bmsafemap *bmsafemap, *collision; struct worklist *wk; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer")); LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type == D_BMSAFEMAP) { if (newbmsafemap) WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); return (WK_BMSAFEMAP(wk)); } } bmsafemaphd = BMSAFEMAP_HASH(ump, cg); if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) { if (newbmsafemap) WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); return (bmsafemap); } if (newbmsafemap) { bmsafemap = newbmsafemap; } else { FREE_LOCK(ump); bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); ACQUIRE_LOCK(ump); } bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_inodedephd); LIST_INIT(&bmsafemap->sm_inodedepwr); LIST_INIT(&bmsafemap->sm_newblkhd); LIST_INIT(&bmsafemap->sm_newblkwr); LIST_INIT(&bmsafemap->sm_jaddrefhd); LIST_INIT(&bmsafemap->sm_jnewblkhd); LIST_INIT(&bmsafemap->sm_freehd); LIST_INIT(&bmsafemap->sm_freewr); if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) { WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); return (collision); } bmsafemap->sm_cg = cg; LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } /* * Direct block allocation dependencies. * * When a new block is allocated, the corresponding disk locations must be * initialized (with zeros or new data) before the on-disk inode points to * them. Also, the freemap from which the block was allocated must be * updated (on disk) before the inode's pointer. These two dependencies are * independent of each other and are needed for all file blocks and indirect * blocks that are pointed to directly by the inode. Just before the * "in-core" version of the inode is updated with a newly allocated block * number, a procedure (below) is called to setup allocation dependency * structures. These structures are removed when the corresponding * dependencies are satisfied or when the block allocation becomes obsolete * (i.e., the file is deleted, the block is de-allocated, or the block is a * fragment that gets upgraded). All of these cases are handled in * procedures described later. * * When a file extension causes a fragment to be upgraded, either to a larger * fragment or to a full block, the on-disk location may change (if the * previous fragment could not simply be extended). In this case, the old * fragment must be de-allocated, but not until after the inode's pointer has * been updated. In most cases, this is handled by later procedures, which * will construct a "freefrag" structure to be added to the workitem queue * when the inode update is complete (or obsolete). The main exception to * this is when an allocation occurs while a pending allocation dependency * (for the same block pointer) remains. This case is handled in the main * allocation dependency setup procedure by immediately freeing the * unreferenced fragments. 
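* Immediate freeing is safe there because the still-pending dependency * means no on-disk pointer ever referenced the superseded fragment; e.g. * a 2KB fragment upgraded again before its first allocdirect completed * can simply be released.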
*/ void softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ ufs_lbn_t off; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ long oldsize; /* size of old block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct freefrag *freefrag; struct inodedep *inodedep; struct pagedep *pagedep; struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; ufs_lbn_t lbn; lbn = bp->b_lblkno; mp = ITOVFS(ip); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocdirect called on non-softdep filesystem")); if (oldblkno && oldblkno != newblkno) freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else freefrag = NULL; CTR6(KTR_SUJ, "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd " "off %jd newsize %ld oldsize %d", ip->i_number, newblkno, oldblkno, off, newsize, oldsize); ACQUIRE_LOCK(ITOUMP(ip)); - if (off >= NDADDR) { + if (off >= UFS_NDADDR) { if (lbn > 0) panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", lbn, off); /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { if (off != lbn) panic("softdep_setup_allocdirect: lbn %jd != off %jd", lbn, off); /* * Allocating a direct block. * * If we are allocating a directory block, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR) pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC, &pagedep); } if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("softdep_setup_allocdirect: newblk already initialized")); /* * Convert the newblk to an allocdirect. */ WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); adp = (struct allocdirect *)newblk; newblk->nb_freefrag = freefrag; adp->ad_offset = off; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; /* * Finish initializing the journal. */ if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point.
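* For example, strictly sequential writes produce allocdirects with * ad_offset 0, 1, 2, ..., so each new entry lands at the tail and the * common case requires no list walk.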
*/ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ITOUMP(ip)); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ITOUMP(ip)); } /* * Merge a newer and older journal record to be stored either in a * newblock or freefrag. This handles aggregating journal records for * fragment allocation into a second record as well as replacing a * journal free with an aborted journal allocation. A segment for the * oldest record will be placed on wkhd if it has been written. If not * the segment for the newer record will suffice. */ static struct worklist * jnewblk_merge(new, old, wkhd) struct worklist *new; struct worklist *old; struct workhead *wkhd; { struct jnewblk *njnewblk; struct jnewblk *jnewblk; /* Handle NULLs to simplify callers. */ if (new == NULL) return (old); if (old == NULL) return (new); /* Replace a jfreefrag with a jnewblk. */ if (new->wk_type == D_JFREEFRAG) { if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno) panic("jnewblk_merge: blkno mismatch: %p, %p", old, new); cancel_jfreefrag(WK_JFREEFRAG(new)); return (old); } if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK) panic("jnewblk_merge: Bad type: old %d new %d\n", old->wk_type, new->wk_type); /* * Handle merging of two jnewblk records that describe * different sets of fragments in the same block. */ jnewblk = WK_JNEWBLK(old); njnewblk = WK_JNEWBLK(new); if (jnewblk->jn_blkno != njnewblk->jn_blkno) panic("jnewblk_merge: Merging disparate blocks."); /* * The record may be rolled back in the cg. */ if (jnewblk->jn_state & UNDONE) { jnewblk->jn_state &= ~UNDONE; njnewblk->jn_state |= UNDONE; njnewblk->jn_state &= ~ATTACHED; } /* * We modify the newer addref and free the older so that if neither * has been written the most up-to-date copy will be on disk. If * both have been written but rolled back we only temporarily need * one of them to fix the bits when the cg write completes. */ jnewblk->jn_state |= ATTACHED | COMPLETE; njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; cancel_jnewblk(jnewblk, wkhd); WORKLIST_REMOVE(&jnewblk->jn_list); free_jnewblk(jnewblk); return (new); } /* * Replace an old allocdirect dependency with a newer one. * This routine must be called with splbio interrupts blocked. 
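* A merge arises when the same block pointer is allocated twice before * the first dependency completes, e.g. a fragment extended by a later * write to the same logical block.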
*/ static void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { struct worklist *wk; struct freefrag *freefrag; freefrag = NULL; LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp)); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || - newadp->ad_offset >= NDADDR) + newadp->ad_offset >= UFS_NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, (intmax_t)oldadp->ad_newblkno, newadp->ad_oldsize, oldadp->ad_newsize); newadp->ad_oldblkno = oldadp->ad_oldblkno; newadp->ad_oldsize = oldadp->ad_oldsize; /* * If the old dependency had a fragment to free or had never * previously had a block allocated, then the new dependency * can immediately post its freefrag and adopt the old freefrag. * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on * the worklist when it is freed by free_newblk. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. * Here, the first part of the fragment allocated to the new * dependency is part of the block currently claimed on disk by * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ freefrag = newadp->ad_freefrag; if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } /* * If we are tracking a new directory-block allocation, * move it from the old allocdirect to the new allocdirect. */ if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { WORKLIST_REMOVE(wk); if (!LIST_EMPTY(&oldadp->ad_newdirblk)) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, wk); } TAILQ_REMOVE(adphead, oldadp, ad_next); /* * We need to move any journal dependencies over to the freefrag * that releases this block if it exists. Otherwise we are * extending an existing block and we'll wait until that is * complete to release the journal space and extend the * new journal to cover this old space as well. */ if (freefrag == NULL) { if (oldadp->ad_newblkno != newadp->ad_newblkno) panic("allocdirect_merge: %jd != %jd", oldadp->ad_newblkno, newadp->ad_newblkno); newadp->ad_block.nb_jnewblk = (struct jnewblk *) jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, &oldadp->ad_block.nb_jnewblk->jn_list, &newadp->ad_block.nb_jwork); oldadp->ad_block.nb_jnewblk = NULL; cancel_newblk(&oldadp->ad_block, NULL, &newadp->ad_block.nb_jwork); } else { wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, &freefrag->ff_list, &freefrag->ff_jwork); freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, &freefrag->ff_jwork); } free_newblk(&oldadp->ad_block); } /* * Allocate a jfreefrag structure to journal a single block free. 
*/ static struct jfreefrag * newjfreefrag(freefrag, ip, blkno, size, lbn) struct freefrag *freefrag; struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; { struct jfreefrag *jfreefrag; struct fs *fs; fs = ITOFS(ip); jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip)); jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; jfreefrag->fr_ino = ip->i_number; jfreefrag->fr_lbn = lbn; jfreefrag->fr_blkno = blkno; jfreefrag->fr_frags = numfrags(fs, size); jfreefrag->fr_freefrag = freefrag; return (jfreefrag); } /* * Allocate a new freefrag structure. */ static struct freefrag * newfreefrag(ip, blkno, size, lbn) struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; { struct freefrag *freefrag; struct ufsmount *ump; struct fs *fs; CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd", ip->i_number, blkno, size, lbn); ump = ITOUMP(ip); fs = ump->um_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); freefrag = malloc(sizeof(struct freefrag), M_FREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump)); freefrag->ff_state = ATTACHED; LIST_INIT(&freefrag->ff_jwork); freefrag->ff_inum = ip->i_number; freefrag->ff_vtype = ITOV(ip)->v_type; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; if (MOUNTEDSUJ(UFSTOVFS(ump))) { freefrag->ff_jdep = (struct worklist *) newjfreefrag(freefrag, ip, blkno, size, lbn); } else { freefrag->ff_state |= DEPCOMPLETE; freefrag->ff_jdep = NULL; } return (freefrag); } /* * This workitem de-allocates fragments that were replaced during * file block allocation. */ static void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); struct workhead wkhd; CTR3(KTR_SUJ, "handle_workitem_freefrag: ino %d blkno %jd size %ld", freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize); /* * It would be illegal to add new completion items to the * freefrag after it was scheduled to be done so it must be * safe to modify the list head here. */ LIST_INIT(&wkhd); ACQUIRE_LOCK(ump); LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); /* * If the journal has not been written we must cancel it here. */ if (freefrag->ff_jdep) { if (freefrag->ff_jdep->wk_type != D_JNEWBLK) panic("handle_workitem_freefrag: Unexpected type %d\n", freefrag->ff_jdep->wk_type); cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); } FREE_LOCK(ump); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(ump); } /* * Set up a dependency structure for an external attributes data block. * This routine follows much of the structure of softdep_setup_allocdirect. * See the description of softdep_setup_allocdirect above for details.
*/ void softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t off; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct freefrag *freefrag; struct inodedep *inodedep; struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; struct ufsmount *ump; ufs_lbn_t lbn; mp = ITOVFS(ip); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocext called on non-softdep filesystem")); - KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR", - (long long)off)); + KASSERT(off < UFS_NXADDR, + ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off)); lbn = bp->b_lblkno; if (oldblkno && oldblkno != newblkno) freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else freefrag = NULL; ACQUIRE_LOCK(ump); if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("softdep_setup_allocext: newblk already initialized")); /* * Convert the newblk to an allocdirect. */ WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); adp = (struct allocdirect *)newblk; newblk->nb_freefrag = freefrag; adp->ad_offset = off; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state |= EXTDATA; /* * Finish initializing the journal. */ if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. */ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ump); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ump); } /* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when * a new block is allocated and pointed to by an entry in a block of * indirect pointers. The undo/redo states described above are also * used here. Because an indirect block contains many pointers that * may have dependencies, a second copy of the entire in-memory indirect * block is kept. The buffer cache copy is always completely up-to-date. 
* The second copy, which is used only as a source for disk writes, * contains only the safe pointers (i.e., those that have no remaining * update dependencies). The second copy is freed when all pointers * are safe. The cache is not allowed to replace indirect blocks with * pending update dependencies. If a buffer containing an indirect * block with dependencies is written, these routines will mark it * dirty again. It can only be successfully written once all the * dependencies are removed. The ffs_fsync routine in conjunction with * softdep_sync_metadata work together to get all the dependencies * removed so that a file can be successfully written to disk. Three * procedures are used when setting up indirect block pointer * dependencies. The division is necessary because of the organization * of the "balloc" routine and because of the distinction between file * pages and file metadata blocks. */ /* * Allocate a new allocindir structure. */ static struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ ufs_lbn_t lbn; { struct newblk *newblk; struct allocindir *aip; struct freefrag *freefrag; struct jnewblk *jnewblk; if (oldblkno) freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn); else freefrag = NULL; ACQUIRE_LOCK(ITOUMP(ip)); if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0) panic("new_allocindir: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("newallocindir: newblk already initialized")); WORKITEM_REASSIGN(newblk, D_ALLOCINDIR); newblk->nb_freefrag = freefrag; aip = (struct allocindir *)newblk; aip->ai_offset = ptrno; aip->ai_oldblkno = oldblkno; aip->ai_lbn = lbn; if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); return (aip); } /* * Called just before setting an indirect block pointer * to a newly allocated file page. */ void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { struct inodedep *inodedep; struct freefrag *freefrag; struct allocindir *aip; struct pagedep *pagedep; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(ip); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocindir_page called on non-softdep filesystem")); KASSERT(lbn == nbp->b_lblkno, ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", lbn, bp->b_lblkno)); CTR4(KTR_SUJ, "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd " "lbn %jd", ip->i_number, newblkno, oldblkno, lbn); ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. 
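* (The pagedep plays the same role here as in softdep_setup_allocdirect() * above: directory add and remove dependencies for this page hang off it.)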
*/ if ((ip->i_mode & IFMT) == IFDIR) pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(ump); if (freefrag) handle_workitem_freefrag(freefrag); } /* * Called just before setting an indirect block pointer to a * newly allocated indirect block. */ void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { struct inodedep *inodedep; struct allocindir *aip; struct ufsmount *ump; ufs_lbn_t lbn; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_allocindir_meta called on non-softdep filesystem")); CTR3(KTR_SUJ, "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d", ip->i_number, newblkno, ptrno); lbn = nbp->b_lblkno; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); aip = newallocindir(ip, ptrno, newblkno, 0, lbn); inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) panic("softdep_setup_allocindir_meta: Block already existed"); FREE_LOCK(ump); } static void indirdep_complete(indirdep) struct indirdep *indirdep; { struct allocindir *aip; LIST_REMOVE(indirdep, ir_next); indirdep->ir_state |= DEPCOMPLETE; while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { LIST_REMOVE(aip, ai_next); free_newblk(&aip->ai_block); } /* * If this indirdep is not attached to a buf it was simply waiting * on completion to clear completehd. free_indirdep() asserts * that nothing is dangling. */ if ((indirdep->ir_state & ONWORKLIST) == 0) free_indirdep(indirdep); } static struct indirdep * indirdep_lookup(mp, ip, bp) struct mount *mp; struct inode *ip; struct buf *bp; { struct indirdep *indirdep, *newindirdep; struct newblk *newblk; struct ufsmount *ump; struct worklist *wk; struct fs *fs; ufs2_daddr_t blkno; ump = VFSTOUFS(mp); LOCK_OWNED(ump); indirdep = NULL; newindirdep = NULL; fs = ump->um_fs; for (;;) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); break; } /* Found on the buffer worklist, no new structure to free. */ if (indirdep != NULL && newindirdep == NULL) return (indirdep); if (indirdep != NULL && newindirdep != NULL) panic("indirdep_lookup: simultaneous create"); /* None found on the buffer and a new structure is ready. */ if (indirdep == NULL && newindirdep != NULL) break; /* None found and no new structure available. 
*/ FREE_LOCK(ump); newindirdep = malloc(sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); newindirdep->ir_state = ATTACHED; if (I_IS_UFS1(ip)) newindirdep->ir_state |= UFS1FMT; TAILQ_INIT(&newindirdep->ir_trunc); newindirdep->ir_saveddata = NULL; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); LIST_INIT(&newindirdep->ir_writehd); LIST_INIT(&newindirdep->ir_completehd); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; } newindirdep->ir_freeblks = NULL; newindirdep->ir_savebp = getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); newindirdep->ir_bp = bp; BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); ACQUIRE_LOCK(ump); } indirdep = newindirdep; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); /* * If the block is not yet allocated we don't set DEPCOMPLETE so * that we don't free dependencies until the pointers are valid. * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather * than using the hash. */ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); else indirdep->ir_state |= DEPCOMPLETE; return (indirdep); } /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static struct freefrag * setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ struct inodedep *inodedep; /* Inodedep for ip */ struct allocindir *aip; /* allocindir allocated by the above routines */ ufs_lbn_t lbn; /* Logical block number for this block. */ { struct fs *fs; struct indirdep *indirdep; struct allocindir *oldaip; struct freefrag *freefrag; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(ip); ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); indirdep = indirdep_lookup(mp, ip, bp); KASSERT(indirdep->ir_savebp != NULL, ("setup_allocindir_phase2 NULL ir_savebp")); aip->ai_indirdep = indirdep; /* * Check for an unwritten dependency for this indirect offset. If * there is, merge the old dependency into the new one. This happens * as a result of reallocblk only. */ freefrag = NULL; if (aip->ai_oldblkno != 0) { LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { if (oldaip->ai_offset == aip->ai_offset) { freefrag = allocindir_merge(aip, oldaip); goto done; } } LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { if (oldaip->ai_offset == aip->ai_offset) { freefrag = allocindir_merge(aip, oldaip); goto done; } } } done: LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); return (freefrag); } /* * Merge two allocindirs which refer to the same block. Move newblock * dependencies and setup the freefrags appropriately. 
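* As noted in setup_allocindir_phase2() above, this happens as a result * of reallocblk only, so the displaced block always has a freefrag (see * the KASSERT below).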
*/ static struct freefrag * allocindir_merge(aip, oldaip) struct allocindir *aip; struct allocindir *oldaip; { struct freefrag *freefrag; struct worklist *wk; if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("allocindir_merge: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; freefrag = aip->ai_freefrag; aip->ai_freefrag = oldaip->ai_freefrag; oldaip->ai_freefrag = NULL; KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); /* * If we are tracking a new directory-block allocation, * move it from the old allocindir to the new allocindir. */ if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { WORKLIST_REMOVE(wk); if (!LIST_EMPTY(&oldaip->ai_newdirblk)) panic("allocindir_merge: extra newdirblk"); WORKLIST_INSERT(&aip->ai_newdirblk, wk); } /* * We can skip journaling for this freefrag and just complete * any pending journal work for the allocindir that is being * removed after the freefrag completes. */ if (freefrag->ff_jdep) cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); LIST_REMOVE(oldaip, ai_next); freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, &freefrag->ff_list, &freefrag->ff_jwork); free_newblk(&oldaip->ai_block); return (freefrag); } static inline void setup_freedirect(freeblks, ip, i, needj) struct freeblks *freeblks; struct inode *ip; int i; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; int frags; blkno = DIP(ip, i_db[i]); if (blkno == 0) return; DIP_SET(ip, i_db[i], 0); ump = ITOUMP(ip); frags = sblksize(ump->um_fs, ip->i_size, i); frags = numfrags(ump->um_fs, frags); newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj); } static inline void setup_freeext(freeblks, ip, i, needj) struct freeblks *freeblks; struct inode *ip; int i; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; int frags; blkno = ip->i_din2->di_extb[i]; if (blkno == 0) return; ip->i_din2->di_extb[i] = 0; ump = ITOUMP(ip); frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i); frags = numfrags(ump->um_fs, frags); newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); } static inline void setup_freeindir(freeblks, ip, i, lbn, needj) struct freeblks *freeblks; struct inode *ip; int i; ufs_lbn_t lbn; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; blkno = DIP(ip, i_ib[i]); if (blkno == 0) return; DIP_SET(ip, i_ib[i], 0); ump = ITOUMP(ip); newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag, 0, needj); } static inline struct freeblks * newfreeblks(mp, ip) struct mount *mp; struct inode *ip; { struct freeblks *freeblks; freeblks = malloc(sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); LIST_INIT(&freeblks->fb_jblkdephd); LIST_INIT(&freeblks->fb_jwork); freeblks->fb_ref = 0; freeblks->fb_cgwait = 0; freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_inum = ip->i_number; freeblks->fb_vtype = ITOV(ip)->v_type; freeblks->fb_modrev = DIP(ip, i_modrev); freeblks->fb_devvp = ITODEVVP(ip); freeblks->fb_chkcnt = 0; freeblks->fb_len = 0; return (freeblks); } static void trunc_indirdep(indirdep, freeblks, bp, off) struct indirdep *indirdep; struct freeblks *freeblks; struct buf *bp; int off; { struct allocindir *aip, *aipn; /* * The first set of allocindirs won't be in savedbp. 
*/ LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, bp, freeblks, 1); LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, bp, freeblks, 1); /* * These will exist in savedbp. */ LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, NULL, freeblks, 0); LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, NULL, freeblks, 0); } /* * Follow the chain of indirects down to lastlbn creating a freework * structure for each. This will be used to start indir_trunc() at * the right offset and create the journal records for the partial * truncation. A second step will handle the truncated dependencies. */ static int setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno) struct freeblks *freeblks; struct inode *ip; ufs_lbn_t lbn; ufs_lbn_t lastlbn; ufs2_daddr_t blkno; { struct indirdep *indirdep; struct indirdep *indirn; struct freework *freework; struct newblk *newblk; struct mount *mp; struct ufsmount *ump; struct buf *bp; uint8_t *start; uint8_t *end; ufs_lbn_t lbnadd; int level; int error; int off; freework = NULL; if (blkno == 0) return (0); mp = freeblks->fb_list.wk_mp; ump = VFSTOUFS(mp); bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno); bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { brelse(bp); return (error); } } level = lbn_level(lbn); lbnadd = lbn_offset(ump->um_fs, level); /* * Compute the offset of the last block we want to keep. Store * in the freework the first block we want to completely free. */ off = (lastlbn - -(lbn + level)) / lbnadd; if (off + 1 == NINDIR(ump->um_fs)) goto nowork; freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0); /* * Link the freework into the indirdep. This will prevent any new * allocations from proceeding until we are finished with the * truncate and the block is written. */ ACQUIRE_LOCK(ump); indirdep = indirdep_lookup(mp, ip, bp); if (indirdep->ir_freeblks) panic("setup_trunc_indir: indirdep already truncated."); TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); freework->fw_indir = indirdep; /* * Cancel any allocindirs that will not make it to disk. * We have to do this for all copies of the indirdep that * live on this newblk. */ if ((indirdep->ir_state & DEPCOMPLETE) == 0) { newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, &newblk); LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) trunc_indirdep(indirn, freeblks, bp, off); } else trunc_indirdep(indirdep, freeblks, bp, off); FREE_LOCK(ump); /* * Creation is protected by the buf lock. The saveddata is only * needed if a full truncation follows a partial truncation but it * is difficult to allocate in that case so we fetch it anyway. */ if (indirdep->ir_saveddata == NULL) indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); nowork: /* Fetch the blkno of the child and the zero start offset.
	 */
	if (I_IS_UFS1(ip)) {
		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
	} else {
		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
	}
	if (freework) {
		/* Zero the truncated pointers. */
		end = bp->b_data + bp->b_bcount;
		bzero(start, end - start);
		bdwrite(bp);
	} else
		bqrelse(bp);
	if (level == 0)
		return (0);
	lbn++; /* adjust level */
	lbn -= (off * lbnadd);
	return (setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno));
}

/*
 * Complete the partial truncation of an indirect block setup by
 * setup_trunc_indir().  This zeros the truncated pointers in the saved
 * copy and writes them to disk before the freeblks is allowed to complete.
 */
static void
complete_trunc_indir(freework)
	struct freework *freework;
{
	struct freework *fwn;
	struct indirdep *indirdep;
	struct ufsmount *ump;
	struct buf *bp;
	uintptr_t start;
	int count;

	ump = VFSTOUFS(freework->fw_list.wk_mp);
	LOCK_OWNED(ump);
	indirdep = freework->fw_indir;
	for (;;) {
		bp = indirdep->ir_bp;
		/* See if the block was discarded. */
		if (bp == NULL)
			break;
		/* Inline part of getdirtybuf().  We don't want bremfree. */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
			break;
		if (BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
		    LOCK_PTR(ump)) == 0)
			BUF_UNLOCK(bp);
		ACQUIRE_LOCK(ump);
	}
	freework->fw_state |= DEPCOMPLETE;
	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
	/*
	 * Zero the pointers in the saved copy.
	 */
	if (indirdep->ir_state & UFS1FMT)
		start = sizeof(ufs1_daddr_t);
	else
		start = sizeof(ufs2_daddr_t);
	start *= freework->fw_start;
	count = indirdep->ir_savebp->b_bcount - start;
	start += (uintptr_t)indirdep->ir_savebp->b_data;
	bzero((char *)start, count);
	/*
	 * We need to start the next truncation in the list if it has not
	 * been started yet.
	 */
	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
	if (fwn != NULL) {
		if (fwn->fw_freeblks == indirdep->ir_freeblks)
			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
		if ((fwn->fw_state & ONWORKLIST) == 0)
			freework_enqueue(fwn);
	}
	/*
	 * If bp is NULL the block was fully truncated, restore
	 * the saved block list otherwise free it if it is no
	 * longer needed.
	 */
	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
		if (bp == NULL)
			bcopy(indirdep->ir_saveddata,
			    indirdep->ir_savebp->b_data,
			    indirdep->ir_savebp->b_bcount);
		free(indirdep->ir_saveddata, M_INDIRDEP);
		indirdep->ir_saveddata = NULL;
	}
	/*
	 * When bp is NULL there is a full truncation pending.  We
	 * must wait for this full truncation to be journaled before
	 * we can release this freework because the disk pointers will
	 * never be written as zero.
	 */
	if (bp == NULL) {
		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
			handle_written_freework(freework);
		else
			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
			    &freework->fw_list);
	} else {
		/* Complete when the real copy is written. */
		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
		BUF_UNLOCK(bp);
	}
}

/*
 * Calculate the number of blocks we are going to release where datablocks
 * is the current total and length is the new file size.
 */
static ufs2_daddr_t
blkcount(fs, datablocks, length)
	struct fs *fs;
	ufs2_daddr_t datablocks;
	off_t length;
{
	off_t totblks, numblks;

	totblks = 0;
	numblks = howmany(length, fs->fs_bsize);
-	if (numblks <= NDADDR) {
+	if (numblks <= UFS_NDADDR) {
		totblks = howmany(length, fs->fs_fsize);
		goto out;
	}
	totblks = blkstofrags(fs, numblks);
-	numblks -= NDADDR;
+	numblks -= UFS_NDADDR;
	/*
	 * Count all single, then double, then triple indirects required.
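	 * (Illustrative arithmetic, not from the source: assume a UFS2
	 * filesystem with 32k blocks and 8-byte pointers, so
	 * NINDIR(fs) == 4096.  A 1GB file spans 32768 blocks, 32756 of
	 * them behind indirects; the loop below charges 8 single
	 * indirect blocks on the first pass and 1 double indirect on
	 * the second before numblks goes negative, so 9 indirect
	 * blocks' worth of frags are added to totblks.)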
	 * Subtracting one indirect's worth of blocks for each pass
	 * acknowledges one of each pointed to by the inode.
	 */
	for (;;) {
		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
		numblks -= NINDIR(fs);
		if (numblks <= 0)
			break;
		numblks = howmany(numblks, NINDIR(fs));
	}
out:
	totblks = fsbtodb(fs, totblks);
	/*
	 * Handle sparse files.  We can't reclaim more blocks than the inode
	 * references.  We will correct it later in handle_complete_freeblks()
	 * when we know the real count.
	 */
	if (totblks > datablocks)
		return (0);
	return (datablocks - totblks);
}

/*
 * Handle freeblocks for journaled softupdate filesystems.
 *
 * Contrary to normal softupdates, we must preserve the block pointers in
 * indirects until their subordinates are free.  This is to avoid journaling
 * every block that is freed which may consume more space than the journal
 * itself.  The recovery program will see the free block journals at the
 * base of the truncated area and traverse them to reclaim space.  The
 * pointers in the inode may be cleared immediately after the journal
 * records are written because each direct and indirect pointer in the
 * inode is recorded in a journal.  This permits full truncation to proceed
 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
 *
 * The algorithm is as follows:
 * 1) Traverse the in-memory state and create journal entries to release
 *    the relevant blocks and full indirect trees.
 * 2) Traverse the indirect block chain adding partial truncation freework
 *    records to indirects in the path to lastlbn.  The freework will
 *    prevent new allocation dependencies from being satisfied in this
 *    indirect until the truncation completes.
 * 3) Read and lock the inode block, performing an update with the new size
 *    and pointers.  This prevents truncated data from becoming valid on
 *    disk through step 4.
 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
 *    eliminate journal work for those records that do not require it.
 * 5) Schedule the journal records to be written followed by the inode block.
 * 6) Allocate any necessary frags for the end of file.
 * 7) Zero any partially truncated blocks.
 *
 * From this point truncation proceeds asynchronously using the freework and
 * indir_trunc machinery.  The file will not be extended again into a
 * partially truncated indirect block until all work is completed but
 * the normal dependency mechanism ensures that it is rolled back/forward
 * as appropriate.  Further truncation may occur without delay and is
 * serialized in indir_trunc().
 */
void
softdep_journal_freeblocks(ip, cred, length, flags)
	struct inode *ip;	/* The inode whose length is to be reduced */
	struct ucred *cred;
	off_t length;		/* The new length for the file */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct freeblks *freeblks, *fbn;
	struct worklist *wk, *wkn;
	struct inodedep *inodedep;
	struct jblkdep *jblkdep;
	struct allocdirect *adp, *adpn;
	struct ufsmount *ump;
	struct fs *fs;
	struct buf *bp;
	struct vnode *vp;
	struct mount *mp;
	ufs2_daddr_t extblocks, datablocks;
	ufs_lbn_t tmpval, lbn, lastlbn;
	int frags, lastoff, iboff, allocblock, needj, error, i;

	ump = ITOUMP(ip);
	mp = UFSTOVFS(ump);
	fs = ump->um_fs;
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
	vp = ITOV(ip);
	needj = 1;
	iboff = -1;
	allocblock = 0;
	extblocks = 0;
	datablocks = 0;
	frags = 0;
	freeblks = newfreeblks(mp, ip);
	ACQUIRE_LOCK(ump);
	/*
	 * If we're truncating a removed file that will never be written
	 * we don't need to journal the block frees.
	 * The canceled journals for the allocations will suffice.
	 */
	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
	    length == 0)
		needj = 0;
	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
	    ip->i_number, length, needj);
	FREE_LOCK(ump);
	/*
	 * Calculate the lbn that we are truncating to.  This results in -1
	 * if we're truncating to 0 bytes.  So it is the last lbn we want
	 * to keep, not the first lbn we want to truncate.
	 */
	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastoff = blkoff(fs, length);
	/*
	 * Compute frags we are keeping in lastlbn.  0 means all.
	 */
-	if (lastlbn >= 0 && lastlbn < NDADDR) {
+	if (lastlbn >= 0 && lastlbn < UFS_NDADDR) {
		frags = fragroundup(fs, lastoff);
		/* adp offset of last valid allocdirect. */
		iboff = lastlbn;
	} else if (lastlbn > 0)
-		iboff = NDADDR;
+		iboff = UFS_NDADDR;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	/*
	 * Handle normal data blocks and indirects.  This section saves
	 * values used after the inode update to complete frag and indirect
	 * truncation.
	 */
	if ((flags & IO_NORMAL) != 0) {
		/*
		 * Handle truncation of whole direct and indirect blocks.
		 */
-		for (i = iboff + 1; i < NDADDR; i++)
+		for (i = iboff + 1; i < UFS_NDADDR; i++)
			setup_freedirect(freeblks, ip, i, needj);
-		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
+		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
+		    i < UFS_NIADDR;
		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
			/* Release a whole indirect tree. */
			if (lbn > lastlbn) {
				setup_freeindir(freeblks, ip, i, -lbn -i,
				    needj);
				continue;
			}
-			iboff = i + NDADDR;
+			iboff = i + UFS_NDADDR;
			/*
			 * Traverse partially truncated indirect tree.
			 */
			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
				setup_trunc_indir(freeblks, ip, -lbn - i,
				    lastlbn, DIP(ip, i_ib[i]));
		}
		/*
		 * Handle partial truncation to a frag boundary.
		 */
		if (frags) {
			ufs2_daddr_t blkno;
			long oldfrags;

			oldfrags = blksize(fs, ip, lastlbn);
			blkno = DIP(ip, i_db[lastlbn]);
			if (blkno && oldfrags != frags) {
				oldfrags -= frags;
				oldfrags = numfrags(fs, oldfrags);
				blkno += numfrags(fs, frags);
				newfreework(ump, freeblks, NULL, lastlbn,
				    blkno, oldfrags, 0, needj);
				if (needj)
					adjust_newfreework(freeblks,
					    numfrags(fs, frags));
			} else if (blkno == 0)
				allocblock = 1;
		}
		/*
		 * Add a journal record for partial truncate if we are
		 * handling indirect blocks.  Non-indirects need no extra
		 * journaling.
		 */
-		if (length != 0 && lastlbn >= NDADDR) {
+		if (length != 0 && lastlbn >= UFS_NDADDR) {
			ip->i_flag |= IN_TRUNCATED;
			newjtrunc(freeblks, length, 0);
		}
		ip->i_size = length;
		DIP_SET(ip, i_size, ip->i_size);
		datablocks = DIP(ip, i_blocks) - extblocks;
		if (length != 0)
			datablocks = blkcount(fs, datablocks, length);
		freeblks->fb_len = length;
	}
	if ((flags & IO_EXT) != 0) {
-		for (i = 0; i < NXADDR; i++)
+		for (i = 0; i < UFS_NXADDR; i++)
			setup_freeext(freeblks, ip, i, needj);
		ip->i_din2->di_extsize = 0;
		datablocks += extblocks;
	}
#ifdef QUOTA
	/* Reference the quotas in case the block count is wrong in the end. */
	quotaref(vp, freeblks->fb_quota);
	(void) chkdq(ip, -datablocks, NOCRED, 0);
#endif
	freeblks->fb_chkcnt = -datablocks;
	UFS_LOCK(ump);
	fs->fs_pendingblocks += datablocks;
	UFS_UNLOCK(ump);
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
	/*
	 * Handle truncation of incomplete alloc direct dependencies.  We
	 * hold the inode block locked to prevent incomplete dependencies
	 * from reaching the disk while we are eliminating those that
	 * have been truncated.
This is a partially inlined ffs_update(). */ ufs_itimes(vp); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, cred, &bp); if (error) { brelse(bp); softdep_error("softdep_journal_freeblocks", error); return; } if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; softdep_update_inodeblock(ip, bp, 0); if (ump->um_fstype == UFS1) *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; else *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; ACQUIRE_LOCK(ump); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (needj), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. */ if (needj) WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); else freeblks->fb_state |= COMPLETE; if ((flags & IO_NORMAL) != 0) { TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { if (adp->ad_offset > iboff) cancel_allocdirect(&inodedep->id_inoupdt, adp, freeblks); /* * Truncate the allocdirect. We could eliminate * or modify journal records as well. */ else if (adp->ad_offset == iboff && frags) adp->ad_newsize = frags; } } if ((flags & IO_EXT) != 0) while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) cancel_allocdirect(&inodedep->id_extupdt, adp, freeblks); /* * Scan the bufwait list for newblock dependencies that will never * make it to disk. */ LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) { if (wk->wk_type != D_ALLOCDIRECT) continue; adp = WK_ALLOCDIRECT(wk); if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) || ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) { cancel_jfreeblk(freeblks, adp->ad_newblkno); cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); } } /* * Add journal work. */ LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) add_to_journal(&jblkdep->jb_list); FREE_LOCK(ump); bdwrite(bp); /* * Truncate dependency structures beyond length. */ trunc_dependencies(ip, freeblks, lastlbn, frags, flags); /* * This is only set when we need to allocate a fragment because * none existed at the end of a frag-sized file. It handles only * allocating a new, zero filled block. */ if (allocblock) { ip->i_size = length - lastoff; DIP_SET(ip, i_size, ip->i_size); error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); if (error != 0) { softdep_error("softdep_journal_freeblks", error); return; } ip->i_size = length; DIP_SET(ip, i_size, length); ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, frags); ffs_update(vp, 0); bawrite(bp); } else if (lastoff != 0 && vp->v_type != VDIR) { int size; /* * Zero the end of a truncated frag or block. */ size = sblksize(fs, length, lastlbn); error = bread(vp, lastlbn, size, cred, &bp); if (error) { softdep_error("softdep_journal_freeblks", error); return; } bzero((char *)bp->b_data + lastoff, size - lastoff); bawrite(bp); } ACQUIRE_LOCK(ump); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; /* * We zero earlier truncations so they don't erroneously * update i_blocks. 
*/ if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) fbn->fb_len = 0; if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) freeblks->fb_state |= INPROGRESS; else freeblks = NULL; FREE_LOCK(ump); if (freeblks) handle_workitem_freeblocks(freeblks, 0); trunc_pages(ip, length, extblocks, flags); } /* * Flush a JOP_SYNC to the journal. */ void softdep_journal_fsync(ip) struct inode *ip; { struct jfsync *jfsync; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_journal_fsync called on non-softdep filesystem")); if ((ip->i_flag & IN_TRUNCATED) == 0) return; ip->i_flag &= ~IN_TRUNCATED; jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump)); jfsync->jfs_size = ip->i_size; jfsync->jfs_ino = ip->i_number; ACQUIRE_LOCK(ump); add_to_journal(&jfsync->jfs_list); jwait(&jfsync->jfs_list, MNT_WAIT); FREE_LOCK(ump); } /* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before * the blocks are made available for use by other files. (The true * requirement is that old pointers must be nullified before new on-disk * pointers are set. We chose this slightly more stringent requirement to * reduce complexity.) Our implementation handles this dependency by updating * the inode (or indirect block) appropriately but delaying the actual block * de-allocation (i.e., freemap and free space count manipulation) until * after the updated versions reach stable storage. After the disk is * updated, the blocks can be safely de-allocated whenever it is convenient. * This implementation handles only the common case of reducing a file's * length to zero. Other cases are handled by the conventional synchronous * write approach. * * The ffs implementation with which we worked double-checks * the state of the block pointers and file size as it reduces * a file's length. Some of this code is replicated here in our * soft updates implementation. The freeblks->fb_chkcnt field is * used to transfer a part of this information to the procedure * that eventually de-allocates the blocks. * * This routine should be called from the routine that shortens * a file's length, before the inode's size or block pointers * are modified. It will save the block pointer information for * later release and zero the inode so that the calling routine * can release it. 
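 *
 * Stated as an ordering sketch (illustrative only; the two function
 * calls are real, the intermediate steps paraphrase the requirement
 * described above rather than quoting any particular caller):
 *
 *	softdep_setup_freeblocks(ip, 0, flags); // save and zero pointers
 *	...the zero'ed inode block reaches stable storage...
 *	handle_workitem_freeblocks(freeblks, 0); // only now free blocks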
 */
void
softdep_setup_freeblocks(ip, length, flags)
	struct inode *ip;	/* The inode whose length is to be reduced */
	off_t length;		/* The new length for the file */
	int flags;		/* IO_EXT and/or IO_NORMAL */
{
	struct ufs1_dinode *dp1;
	struct ufs2_dinode *dp2;
	struct freeblks *freeblks;
	struct inodedep *inodedep;
	struct allocdirect *adp;
	struct ufsmount *ump;
	struct buf *bp;
	struct fs *fs;
	ufs2_daddr_t extblocks, datablocks;
	struct mount *mp;
	int i, delay, error;
	ufs_lbn_t tmpval;
	ufs_lbn_t lbn;

	ump = ITOUMP(ip);
	mp = UFSTOVFS(ump);
	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
	    ip->i_number, length);
	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
	fs = ump->um_fs;
	if ((error = bread(ump->um_devvp,
	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
		brelse(bp);
		softdep_error("softdep_setup_freeblocks", error);
		return;
	}
	freeblks = newfreeblks(mp, ip);
	extblocks = 0;
	datablocks = 0;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
	if ((flags & IO_NORMAL) != 0) {
-		for (i = 0; i < NDADDR; i++)
+		for (i = 0; i < UFS_NDADDR; i++)
			setup_freedirect(freeblks, ip, i, 0);
-		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
+		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
+		    i < UFS_NIADDR;
		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
		ip->i_size = 0;
		DIP_SET(ip, i_size, 0);
		datablocks = DIP(ip, i_blocks) - extblocks;
	}
	if ((flags & IO_EXT) != 0) {
-		for (i = 0; i < NXADDR; i++)
+		for (i = 0; i < UFS_NXADDR; i++)
			setup_freeext(freeblks, ip, i, 0);
		ip->i_din2->di_extsize = 0;
		datablocks += extblocks;
	}
#ifdef QUOTA
	/* Reference the quotas in case the block count is wrong in the end. */
	quotaref(ITOV(ip), freeblks->fb_quota);
	(void) chkdq(ip, -datablocks, NOCRED, 0);
#endif
	freeblks->fb_chkcnt = -datablocks;
	UFS_LOCK(ump);
	fs->fs_pendingblocks += datablocks;
	UFS_UNLOCK(ump);
	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
	/*
	 * Push the zero'ed inode to its disk buffer so that we are free
	 * to delete its dependencies below.  Once the dependencies are gone
	 * the buffer can be safely released.
	 */
	if (ump->um_fstype == UFS1) {
		dp1 = ((struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number));
		ip->i_din1->di_freelink = dp1->di_freelink;
		*dp1 = *ip->i_din1;
	} else {
		dp2 = ((struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, ip->i_number));
		ip->i_din2->di_freelink = dp2->di_freelink;
		*dp2 = *ip->i_din2;
	}
	/*
	 * Find and eliminate any inode dependencies.
	 */
	ACQUIRE_LOCK(ump);
	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
	if ((inodedep->id_state & IOSTARTED) != 0)
		panic("softdep_setup_freeblocks: inode busy");
	/*
	 * Add the freeblks structure to the list of operations that
	 * must await the zero'ed inode being written to disk.  If we
	 * still have a bitmap dependency (delay == 0), then the inode
	 * has never been written to disk, so we can process the
	 * freeblks below once we have deleted the dependencies.
	 */
	delay = (inodedep->id_state & DEPCOMPLETE);
	if (delay)
		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
	else
		freeblks->fb_state |= COMPLETE;
	/*
	 * Because the file length has been truncated to zero, any
	 * pending block allocation dependency structures associated
	 * with this inode are obsolete and can simply be de-allocated.
	 * We must first merge the two dependency lists to get rid of
	 * any duplicate freefrag structures, then purge the merged list.
	 * If we still have a bitmap dependency, then the inode has never
	 * been written to disk, so we can free any fragments without delay.
	 */
	if (flags & IO_NORMAL) {
		merge_inode_lists(&inodedep->id_newinoupdt,
		    &inodedep->id_inoupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
			cancel_allocdirect(&inodedep->id_inoupdt, adp,
			    freeblks);
	}
	if (flags & IO_EXT) {
		merge_inode_lists(&inodedep->id_newextupdt,
		    &inodedep->id_extupdt);
		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
			cancel_allocdirect(&inodedep->id_extupdt, adp,
			    freeblks);
	}
	FREE_LOCK(ump);
	bdwrite(bp);
	trunc_dependencies(ip, freeblks, -1, 0, flags);
	ACQUIRE_LOCK(ump);
	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
		(void) free_inodedep(inodedep);
	freeblks->fb_state |= DEPCOMPLETE;
	/*
	 * If the inode with zeroed block pointers is now on disk
	 * we can start freeing blocks.
	 */
	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
		freeblks->fb_state |= INPROGRESS;
	else
		freeblks = NULL;
	FREE_LOCK(ump);
	if (freeblks)
		handle_workitem_freeblocks(freeblks, 0);
	trunc_pages(ip, length, extblocks, flags);
}

/*
 * Eliminate pages from the page cache that back parts of this inode and
 * adjust the vnode pager's idea of our size.  This prevents stale data
 * from hanging around in the page cache.
 */
static void
trunc_pages(ip, length, extblocks, flags)
	struct inode *ip;
	off_t length;
	ufs2_daddr_t extblocks;
	int flags;
{
	struct vnode *vp;
	struct fs *fs;
	ufs_lbn_t lbn;
	off_t end, extend;

	vp = ITOV(ip);
	fs = ITOFS(ip);
	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
	if ((flags & IO_EXT) != 0)
		vn_pages_remove(vp, extend, 0);
	if ((flags & IO_NORMAL) == 0)
		return;
	BO_LOCK(&vp->v_bufobj);
	drain_output(vp);
	BO_UNLOCK(&vp->v_bufobj);
	/*
	 * The vnode pager eliminates file pages; we eliminate indirects
	 * below.
	 */
	vnode_pager_setsize(vp, length);
	/*
	 * Calculate the end based on the last indirect we want to keep.  If
	 * the block extends into indirects we can just use the negative of
	 * its lbn.  Doubles and triples exist at lower numbers so we must
	 * be careful not to remove those, if they exist.  Double and triple
	 * indirect lbns do not overlap with others so it is not important
	 * to verify how many levels are required.
	 */
	lbn = lblkno(fs, length);
-	if (lbn >= NDADDR) {
+	if (lbn >= UFS_NDADDR) {
		/* Calculate the virtual lbn of the triple indirect. */
-		lbn = -lbn - (NIADDR - 1);
+		lbn = -lbn - (UFS_NIADDR - 1);
		end = OFF_TO_IDX(lblktosize(fs, lbn));
	} else
		end = extend;
	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
}

/*
 * See if the buf bp is in the range eliminated by truncation.
 */
static int
trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
	struct buf *bp;
	int *blkoffp;
	ufs_lbn_t lastlbn;
	int lastoff;
	int flags;
{
	ufs_lbn_t lbn;

	*blkoffp = 0;
	/* Only match ext/normal blocks as appropriate. */
	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
		return (0);
	/* ALTDATA is always a full truncation. */
	if ((bp->b_xflags & BX_ALTDATA) != 0)
		return (1);
	/* -1 is full truncation. */
	if (lastlbn == -1)
		return (1);
	/*
	 * If this is a partial truncate we only want those
	 * blocks and indirect blocks that cover the range
	 * we're after.
	 */
	lbn = bp->b_lblkno;
	if (lbn < 0)
		lbn = -(lbn + lbn_level(lbn));
	if (lbn < lastlbn)
		return (0);
	/* Here we only truncate lblkno if it's partial.
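	 * (Illustrative example, not from the source: a journaled
	 * truncate to 1536 bytes on an 8k/1k filesystem arrives here
	 * with lastlbn 0 and lastoff frag-rounded to 2048; the buffer
	 * for lbn 0 matches below with *blkoffp = 2048, while buffers
	 * past lbn 0 return 1 with *blkoffp = 0 and are discarded
	 * whole.)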
	 */
	if (lbn == lastlbn) {
		if (lastoff == 0)
			return (0);
		*blkoffp = lastoff;
	}
	return (1);
}

/*
 * Eliminate any dependencies that exist in memory beyond lblkno:off
 */
static void
trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
	struct inode *ip;
	struct freeblks *freeblks;
	ufs_lbn_t lastlbn;
	int lastoff;
	int flags;
{
	struct bufobj *bo;
	struct vnode *vp;
	struct buf *bp;
	int blkoff;

	/*
	 * We must wait for any I/O in progress to finish so that
	 * all potential buffers on the dirty list will be visible.
	 * Once they are all there, walk the list and get rid of
	 * any dependencies.
	 */
	vp = ITOV(ip);
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	drain_output(vp);
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
restart:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
		if (bp->b_vflags & BV_SCANNED)
			continue;
		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
			bp->b_vflags |= BV_SCANNED;
			continue;
		}
		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
			goto restart;
		BO_UNLOCK(bo);
		if (deallocate_dependencies(bp, freeblks, blkoff))
			bqrelse(bp);
		else
			brelse(bp);
		BO_LOCK(bo);
		goto restart;
	}
	/*
	 * Now do the work of vtruncbuf while also matching indirect blocks.
	 */
	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
cleanrestart:
	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
		if (bp->b_vflags & BV_SCANNED)
			continue;
		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
			bp->b_vflags |= BV_SCANNED;
			continue;
		}
		if (BUF_LOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
		    BO_LOCKPTR(bo)) == ENOLCK) {
			BO_LOCK(bo);
			goto cleanrestart;
		}
		bp->b_vflags |= BV_SCANNED;
		bremfree(bp);
		if (blkoff != 0) {
			allocbuf(bp, blkoff);
			bqrelse(bp);
		} else {
			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
			brelse(bp);
		}
		BO_LOCK(bo);
		goto cleanrestart;
	}
	drain_output(vp);
	BO_UNLOCK(bo);
}

static int
cancel_pagedep(pagedep, freeblks, blkoff)
	struct pagedep *pagedep;
	struct freeblks *freeblks;
	int blkoff;
{
	struct jremref *jremref;
	struct jmvref *jmvref;
	struct dirrem *dirrem, *tmp;
	int i;

	/*
	 * Copy any directory remove dependencies to the list
	 * to be processed after the freeblks proceeds.  If the
	 * directory entries never made it to disk they
	 * can be dumped directly onto the work list.
	 */
	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
		/* Skip this directory removal if it is intended to remain. */
		if (dirrem->dm_offset < blkoff)
			continue;
		/*
		 * If there are any dirrems we wait for the journal write
		 * to complete and then restart the buf scan as the lock
		 * has been dropped.
		 */
		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
			jwait(&jremref->jr_list, MNT_WAIT);
			return (ERESTART);
		}
		LIST_REMOVE(dirrem, dm_next);
		dirrem->dm_dirinum = pagedep->pd_ino;
		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
	}
	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
		jwait(&jmvref->jm_list, MNT_WAIT);
		return (ERESTART);
	}
	/*
	 * When we're partially truncating a pagedep we just want to flush
	 * journal entries and return.  There cannot be any adds in the
	 * truncated portion of the directory and newblk must remain if
	 * part of the block remains.
*/ if (blkoff != 0) { struct diradd *dap; LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset > blkoff) panic("cancel_pagedep: diradd %p off %d > %d", dap, dap->da_offset, blkoff); for (i = 0; i < DAHASHSZ; i++) LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) if (dap->da_offset > blkoff) panic("cancel_pagedep: diradd %p off %d > %d", dap, dap->da_offset, blkoff); return (0); } /* * There should be no directory add dependencies present * as the directory could not be truncated until all * children were removed. */ KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, ("deallocate_dependencies: pendinghd != NULL")); for (i = 0; i < DAHASHSZ; i++) KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, ("deallocate_dependencies: diraddhd != NULL")); if ((pagedep->pd_state & NEWBLOCK) != 0) free_newdirblk(pagedep->pd_newdirblk); if (free_pagedep(pagedep) == 0) panic("Failed to free pagedep %p", pagedep); return (0); } /* * Reclaim any dependency structures from a buffer that is about to * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. */ static int deallocate_dependencies(bp, freeblks, off) struct buf *bp; struct freeblks *freeblks; int off; { struct indirdep *indirdep; struct pagedep *pagedep; struct worklist *wk, *wkn; struct ufsmount *ump; if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) goto done; ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { switch (wk->wk_type) { case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); cancel_indirdep(indirdep, bp, freeblks); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); if (cancel_pagedep(pagedep, freeblks, off)) { FREE_LOCK(ump); return (ERESTART); } continue; case D_ALLOCINDIR: /* * Simply remove the allocindir, we'll find it via * the indirdep where we can clear pointers if * needed. */ WORKLIST_REMOVE(wk); continue; case D_FREEWORK: /* * A truncation is waiting for the zero'd pointers * to be written. It can be freed when the freeblks * is journaled. */ WORKLIST_REMOVE(wk); wk->wk_state |= ONDEPLIST; WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); break; case D_ALLOCDIRECT: if (off != 0) continue; /* FALLTHROUGH */ default: panic("deallocate_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(ump); done: /* * Don't throw away this buf, we were partially truncating and * some deps may always remain. */ if (off) { allocbuf(bp, off); bp->b_vflags |= BV_SCANNED; return (EBUSY); } bp->b_flags |= B_INVAL | B_NOCACHE; return (0); } /* * An allocdirect is being canceled due to a truncate. We must make sure * the journal entry is released in concert with the blkfree that releases * the storage. Completed journal entries must not be released until the * space is no longer pointed to by the inode or in the bitmap. */ static void cancel_allocdirect(adphead, adp, freeblks) struct allocdirectlst *adphead; struct allocdirect *adp; struct freeblks *freeblks; { struct freework *freework; struct newblk *newblk; struct worklist *wk; TAILQ_REMOVE(adphead, adp, ad_next); newblk = (struct newblk *)adp; freework = NULL; /* * Find the correct freework structure. 
*/ LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { if (wk->wk_type != D_FREEWORK) continue; freework = WK_FREEWORK(wk); if (freework->fw_blkno == newblk->nb_newblkno) break; } if (freework == NULL) panic("cancel_allocdirect: Freework not found"); /* * If a newblk exists at all we still have the journal entry that * initiated the allocation so we do not need to journal the free. */ cancel_jfreeblk(freeblks, freework->fw_blkno); /* * If the journal hasn't been written the jnewblk must be passed * to the call to ffs_blkfree that reclaims the space. We accomplish * this by linking the journal dependency into the freework to be * freed when freework_freeblock() is called. If the journal has * been written we can simply reclaim the journal space when the * freeblks work is complete. */ freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); } /* * Cancel a new block allocation. May be an indirect or direct block. We * remove it from various lists and return any journal record that needs to * be resolved by the caller. * * A special consideration is made for indirects which were never pointed * at on disk and will never be found once this block is released. */ static struct jnewblk * cancel_newblk(newblk, wk, wkhd) struct newblk *newblk; struct worklist *wk; struct workhead *wkhd; { struct jnewblk *jnewblk; CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno); newblk->nb_state |= GOINGAWAY; /* * Previously we traversed the completedhd on each indirdep * attached to this newblk to cancel them and gather journal * work. Since we need only the oldest journal segment and * the lowest point on the tree will always have the oldest * journal segment we are free to release the segments * of any subordinates and may leave the indirdep list to * indirdep_complete() when this newblk is freed. */ if (newblk->nb_state & ONDEPLIST) { newblk->nb_state &= ~ONDEPLIST; LIST_REMOVE(newblk, nb_deps); } if (newblk->nb_state & ONWORKLIST) WORKLIST_REMOVE(&newblk->nb_list); /* * If the journal entry hasn't been written we save a pointer to * the dependency that frees it until it is written or the * superseding operation completes. */ jnewblk = newblk->nb_jnewblk; if (jnewblk != NULL && wk != NULL) { newblk->nb_jnewblk = NULL; jnewblk->jn_dep = wk; } if (!LIST_EMPTY(&newblk->nb_jwork)) jwork_move(wkhd, &newblk->nb_jwork); /* * When truncating we must free the newdirblk early to remove * the pagedep from the hash before returning. */ if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) free_newdirblk(WK_NEWDIRBLK(wk)); if (!LIST_EMPTY(&newblk->nb_newdirblk)) panic("cancel_newblk: extra newdirblk"); return (jnewblk); } /* * Schedule the freefrag associated with a newblk to be released once * the pointers are written and the previous block is no longer needed. */ static void newblk_freefrag(newblk) struct newblk *newblk; { struct freefrag *freefrag; if (newblk->nb_freefrag == NULL) return; freefrag = newblk->nb_freefrag; newblk->nb_freefrag = NULL; freefrag->ff_state |= COMPLETE; if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freefrag->ff_list, 0); } /* * Free a newblk. Generate a new freefrag work request if appropriate. * This must be called after the inode pointer and any direct block pointers * are valid or fully removed via truncate or frag extension. 
 */
static void
free_newblk(newblk)
	struct newblk *newblk;
{
	struct indirdep *indirdep;
	struct worklist *wk;

	KASSERT(newblk->nb_jnewblk == NULL,
	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
	    ("free_newblk: unclaimed newblk"));
	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
	newblk_freefrag(newblk);
	if (newblk->nb_state & ONDEPLIST)
		LIST_REMOVE(newblk, nb_deps);
	if (newblk->nb_state & ONWORKLIST)
		WORKLIST_REMOVE(&newblk->nb_list);
	LIST_REMOVE(newblk, nb_hash);
	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
		free_newdirblk(WK_NEWDIRBLK(wk));
	if (!LIST_EMPTY(&newblk->nb_newdirblk))
		panic("free_newblk: extra newdirblk");
	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
		indirdep_complete(indirdep);
	handle_jwork(&newblk->nb_jwork);
	WORKITEM_FREE(newblk, D_NEWBLK);
}

/*
 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
 * This routine must be called with splbio interrupts blocked.
 */
static void
free_newdirblk(newdirblk)
	struct newdirblk *newdirblk;
{
	struct pagedep *pagedep;
	struct diradd *dap;
	struct worklist *wk;

	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
	WORKLIST_REMOVE(&newdirblk->db_list);
	/*
	 * If the pagedep is still linked onto the directory buffer
	 * dependency chain, then some of the entries on the
	 * pd_pendinghd list may not be committed to disk yet. In
	 * this case, we will simply clear the NEWBLOCK flag and
	 * let the pd_pendinghd list be processed when the pagedep
	 * is next written. If the pagedep is no longer on the buffer
	 * dependency chain, then all the entries on the pd_pending
	 * list are committed to disk and we can free them here.
	 */
	pagedep = newdirblk->db_pagedep;
	pagedep->pd_state &= ~NEWBLOCK;
	if ((pagedep->pd_state & ONWORKLIST) == 0) {
		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
			free_diradd(dap, NULL);
		/*
		 * If no dependencies remain, the pagedep will be freed.
		 */
		free_pagedep(pagedep);
	}
	/* Should only ever be one item in the list. */
	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
		WORKLIST_REMOVE(wk);
		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
	}
	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
}

/*
 * Prepare an inode to be freed. The actual free operation is not
 * done until the zero'ed inode has been written to disk.
 */
void
softdep_freefile(pvp, ino, mode)
	struct vnode *pvp;
	ino_t ino;
	int mode;
{
	struct inode *ip = VTOI(pvp);
	struct inodedep *inodedep;
	struct freefile *freefile;
	struct freeblks *freeblks;
	struct ufsmount *ump;

	ump = ITOUMP(ip);
	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
	    ("softdep_freefile called on non-softdep filesystem"));
	/*
	 * This sets up the inode de-allocation dependency.
	 */
	freefile = malloc(sizeof(struct freefile),
	    M_FREEFILE, M_SOFTDEP_FLAGS);
	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
	freefile->fx_mode = mode;
	freefile->fx_oldinum = ino;
	freefile->fx_devvp = ump->um_devvp;
	LIST_INIT(&freefile->fx_jwork);
	UFS_LOCK(ump);
	ump->um_fs->fs_pendinginodes += 1;
	UFS_UNLOCK(ump);

	/*
	 * If the inodedep does not exist, then the zero'ed inode has
	 * been written to disk. If the allocated inode has never been
	 * written to disk, then the on-disk inode is zero'ed. In either
	 * case we can free the file immediately.  If the journal was
	 * canceled before being written the inode will never make it to
	 * disk and we must send the canceled journal entries to
	 * ffs_freefile() to be cleared in conjunction with the bitmap.
	 * Any blocks waiting on the inode to write can be safely freed
	 * here as it will never be written.
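	 *
	 * Summarized (matching the code below):
	 *	inodedep == NULL	-> zero'ed inode already on disk;
	 *				   free the file immediately.
	 *	check_inode_unwritten()	-> inode never written; drop the
	 *				   dependencies and free now.
	 *	otherwise		-> queue the freefile on id_inowait
	 *				   until the inode block is written.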
*/ ACQUIRE_LOCK(ump); inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); if (inodedep) { /* * Clear out freeblks that no longer need to reference * this inode. */ while ((freeblks = TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state &= ~ONDEPLIST; } /* * Remove this inode from the unlinked list. */ if (inodedep->id_state & UNLINKED) { /* * Save the journal work to be freed with the bitmap * before we clear UNLINKED. Otherwise it can be lost * if the inode block is written. */ handle_bufwait(inodedep, &freefile->fx_jwork); clear_unlinked_inodedep(inodedep); /* * Re-acquire inodedep as we've dropped the * per-filesystem lock in clear_unlinked_inodedep(). */ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); } } if (inodedep == NULL || check_inode_unwritten(inodedep)) { FREE_LOCK(ump); handle_workitem_freefile(freefile); return; } if ((inodedep->id_state & DEPCOMPLETE) == 0) inodedep->id_state |= GOINGAWAY; WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); FREE_LOCK(ump); if (ip->i_number == ino) ip->i_flag |= IN_MODIFIED; } /* * Check to see if an inode has never been written to disk. If * so free the inodedep and return success, otherwise return failure. * This routine must be called with splbio interrupts blocked. * * If we still have a bitmap dependency, then the inode has never * been written to disk. Drop the dependency as it is no longer * necessary since the inode is being deallocated. We set the * ALLCOMPLETE flags since the bitmap now properly shows that the * inode is not allocated. Even if the inode is actively being * written, it has been rolled back to its zero'ed state, so we * are ensured that a zero inode is what is on the disk. For short * lived files, this change will usually result in removing all the * dependencies from the inode so that it can be freed immediately. */ static int check_inode_unwritten(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_freeblklst) || inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0) return (0); /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". 
*/ if ((inodedep->id_state & IOSTARTED) != 0 && inodedep->id_savedino1 == NULL) return (0); if (inodedep->id_state & ONDEPLIST) LIST_REMOVE(inodedep, id_deps); inodedep->id_state &= ~ONDEPLIST; inodedep->id_state |= ALLCOMPLETE; inodedep->id_bmsafemap = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { free(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; } if (free_inodedep(inodedep) == 0) panic("check_inode_unwritten: busy inode"); return (1); } static int check_inodedep_free(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_freeblklst) || inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) return (0); return (1); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ static int free_inodedep(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || !check_inodedep_free(inodedep)) return (0); if (inodedep->id_state & ONDEPLIST) LIST_REMOVE(inodedep, id_deps); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } /* * Free the block referenced by a freework structure. The parent freeblks * structure is released and completed when the final cg bitmap reaches * the disk. This routine may be freeing a jnewblk which never made it to * disk in which case we do not have to wait as the operation is undone * in memory immediately. */ static void freework_freeblock(freework) struct freework *freework; { struct freeblks *freeblks; struct jnewblk *jnewblk; struct ufsmount *ump; struct workhead wkhd; struct fs *fs; int bsize; int needj; ump = VFSTOUFS(freework->fw_list.wk_mp); LOCK_OWNED(ump); /* * Handle partial truncate separately. */ if (freework->fw_indir) { complete_trunc_indir(freework); return; } freeblks = freework->fw_freeblks; fs = ump->um_fs; needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; bsize = lfragtosize(fs, freework->fw_frags); LIST_INIT(&wkhd); /* * DEPCOMPLETE is cleared in indirblk_insert() if the block lives * on the indirblk hashtable and prevents premature freeing. */ freework->fw_state |= DEPCOMPLETE; /* * SUJ needs to wait for the segment referencing freed indirect * blocks to expire so that we know the checker will not confuse * a re-allocated indirect block with its old contents. */ - if (needj && freework->fw_lbn <= -NDADDR) + if (needj && freework->fw_lbn <= -UFS_NDADDR) indirblk_insert(freework); /* * If we are canceling an existing jnewblk pass it to the free * routine, otherwise pass the freeblk which will ultimately * release the freeblks. If we're not journaling, we can just * free the freeblks immediately. 
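	 *
	 * In short (matching the branches below):
	 *	fw_jnewblk != NULL -> cancel_jnewblk(); the bitmap never
	 *			      reached disk, so free at once.
	 *	needj		   -> mark DELAYEDFREE and wait for the
	 *			      cg bitmap write.
	 *	otherwise	   -> free immediately.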
*/ jnewblk = freework->fw_jnewblk; if (jnewblk != NULL) { cancel_jnewblk(jnewblk, &wkhd); needj = 0; } else if (needj) { freework->fw_state |= DELAYEDFREE; freeblks->fb_cgwait++; WORKLIST_INSERT(&wkhd, &freework->fw_list); } FREE_LOCK(ump); freeblks_free(ump, freeblks, btodb(bsize)); CTR4(KTR_SUJ, "freework_freeblock: ino %d blkno %jd lbn %jd size %ld", freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, freeblks->fb_inum, freeblks->fb_vtype, &wkhd); ACQUIRE_LOCK(ump); /* * The jnewblk will be discarded and the bits in the map never * made it to disk. We can immediately free the freeblk. */ if (needj == 0) handle_written_freework(freework); } /* * We enqueue freework items that need processing back on the freeblks and * add the freeblks to the worklist. This makes it easier to find all work * required to flush a truncation in process_truncates(). */ static void freework_enqueue(freework) struct freework *freework; { struct freeblks *freeblks; freeblks = freework->fw_freeblks; if ((freework->fw_state & INPROGRESS) == 0) WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); if ((freeblks->fb_state & (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(&freeblks->fb_list, WK_NODELAY); } /* * Start, continue, or finish the process of freeing an indirect block tree. * The free operation may be paused at any point with fw_off containing the * offset to restart from. This enables us to implement some flow control * for large truncates which may fan out and generate a huge number of * dependencies. */ static void handle_workitem_indirblk(freework) struct freework *freework; { struct freeblks *freeblks; struct ufsmount *ump; struct fs *fs; freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; if (freework->fw_state & DEPCOMPLETE) { handle_written_freework(freework); return; } if (freework->fw_off == NINDIR(fs)) { freework_freeblock(freework); return; } freework->fw_state |= INPROGRESS; FREE_LOCK(ump); indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), freework->fw_lbn); ACQUIRE_LOCK(ump); } /* * Called when a freework structure attached to a cg buf is written. The * ref on either the parent or the freeblks structure is released and * the freeblks is added back to the worklist if there is more work to do. */ static void handle_written_freework(freework) struct freework *freework; { struct freeblks *freeblks; struct freework *parent; freeblks = freework->fw_freeblks; parent = freework->fw_parent; if (freework->fw_state & DELAYEDFREE) freeblks->fb_cgwait--; freework->fw_state |= COMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); if (parent) { if (--parent->fw_ref == 0) freework_enqueue(parent); return; } if (--freeblks->fb_ref != 0) return; if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(&freeblks->fb_list, WK_NODELAY); } /* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, * checks regarding the number of blocks de-allocated (compared * to the number of blocks allocated for the file) are also * performed in this function. 
*/ static int handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct freework *freework; struct newblk *newblk; struct allocindir *aip; struct ufsmount *ump; struct worklist *wk; KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), ("handle_workitem_freeblocks: Journal entries not written.")); ump = VFSTOUFS(freeblks->fb_list.wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_DIRREM: wk->wk_state |= COMPLETE; add_to_worklist(wk, 0); continue; case D_ALLOCDIRECT: free_newblk(WK_NEWBLK(wk)); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); freework = NULL; if (aip->ai_state & DELAYEDFREE) { FREE_LOCK(ump); freework = newfreework(ump, freeblks, NULL, aip->ai_lbn, aip->ai_newblkno, ump->um_fs->fs_frag, 0, 0); ACQUIRE_LOCK(ump); } newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk) { freework->fw_jnewblk = newblk->nb_jnewblk; newblk->nb_jnewblk->jn_dep = &freework->fw_list; newblk->nb_jnewblk = NULL; } free_newblk(newblk); continue; case D_FREEWORK: freework = WK_FREEWORK(wk); - if (freework->fw_lbn <= -NDADDR) + if (freework->fw_lbn <= -UFS_NDADDR) handle_workitem_indirblk(freework); else freework_freeblock(freework); continue; default: panic("handle_workitem_freeblocks: Unknown type %s", TYPENAME(wk->wk_type)); } } if (freeblks->fb_ref != 0) { freeblks->fb_state &= ~INPROGRESS; wake_worklist(&freeblks->fb_list); freeblks = NULL; } FREE_LOCK(ump); if (freeblks) return handle_complete_freeblocks(freeblks, flags); return (0); } /* * Handle completion of block free via truncate. This allows fs_pending * to track the actual free block count more closely than if we only updated * it at the end. We must be careful to handle cases where the block count * on free was incorrect. */ static void freeblks_free(ump, freeblks, blocks) struct ufsmount *ump; struct freeblks *freeblks; int blocks; { struct fs *fs; ufs2_daddr_t remain; UFS_LOCK(ump); remain = -freeblks->fb_chkcnt; freeblks->fb_chkcnt += blocks; if (remain > 0) { if (remain < blocks) blocks = remain; fs = ump->um_fs; fs->fs_pendingblocks -= blocks; } UFS_UNLOCK(ump); } /* * Once all of the freework workitems are complete we can retire the * freeblocks dependency and any journal work awaiting completion. This * can not be called until all other dependencies are stable on disk. */ static int handle_complete_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct inodedep *inodedep; struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; ufs2_daddr_t spare; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; flags = LK_EXCLUSIVE | flags; spare = freeblks->fb_chkcnt; /* * If we did not release the expected number of blocks we may have * to adjust the inode block count here. Only do so if it wasn't * a truncation to zero and the modrev still matches. */ if (spare && freeblks->fb_len != 0) { if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, flags, &vp, FFSV_FORCEINSMQ) != 0) return (EBUSY); ip = VTOI(vp); if (DIP(ip, i_modrev) == freeblks->fb_modrev) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); ip->i_flag |= IN_CHANGE; /* * We must wait so this happens before the * journal is reclaimed. */ ffs_update(vp, 1); } vput(vp); } if (spare < 0) { UFS_LOCK(ump); fs->fs_pendingblocks += spare; UFS_UNLOCK(ump); } #ifdef QUOTA /* Handle spare. 
*/ if (spare) quotaadj(freeblks->fb_quota, ump, -spare); quotarele(freeblks->fb_quota); #endif ACQUIRE_LOCK(ump); if (freeblks->fb_state & ONDEPLIST) { inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 0, &inodedep); TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state &= ~ONDEPLIST; if (TAILQ_EMPTY(&inodedep->id_freeblklst)) free_inodedep(inodedep); } /* * All of the freeblock deps must be complete prior to this call * so it's now safe to complete earlier outstanding journal entries. */ handle_jwork(&freeblks->fb_jwork); WORKITEM_FREE(freeblks, D_FREEBLKS); FREE_LOCK(ump); return (0); } /* * Release blocks associated with the freeblks and stored in the indirect * block dbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * This handles partial and complete truncation of blocks. Partial is noted * with goingaway == 0. In this case the freework is completed after the * zero'd indirects are written to disk. For full truncation the freework * is completed after the block is freed. */ static void indir_trunc(freework, dbn, lbn) struct freework *freework; ufs2_daddr_t dbn; ufs_lbn_t lbn; { struct freework *nfreework; struct workhead wkhd; struct freeblks *freeblks; struct buf *bp; struct fs *fs; struct indirdep *indirdep; struct ufsmount *ump; ufs1_daddr_t *bap1; ufs2_daddr_t nb, nnb, *bap2; ufs_lbn_t lbnadd, nlbn; int i, nblocks, ufs1fmt; int freedblocks; int goingaway; int freedeps; int needj; int level; int cnt; freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; /* * Get buffer of block pointers to be freed. There are three cases: * * 1) Partial truncate caches the indirdep pointer in the freework * which provides us a back copy to the save bp which holds the * pointers we want to clear. When this completes the zero * pointers are written to the real copy. * 2) The indirect is being completely truncated, cancel_indirdep() * eliminated the real copy and placed the indirdep on the saved * copy. The indirdep and buf are discarded when this completes. * 3) The indirect was not in memory, we read a copy off of the disk * using the devvp and drop and invalidate the buffer when we're * done. */ goingaway = 1; indirdep = NULL; if (freework->fw_indir != NULL) { goingaway = 0; indirdep = freework->fw_indir; bp = indirdep->ir_savebp; if (bp == NULL || bp->b_blkno != dbn) panic("indir_trunc: Bad saved buf %p blkno %jd", bp, (intmax_t)dbn); } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { /* * The lock prevents the buf dep list from changing and * indirects on devvp should only ever have one dependency. */ indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: Bad indirdep %p from buf %p", indirdep, bp); } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp) != 0) { brelse(bp); return; } ACQUIRE_LOCK(ump); /* Protects against a race with complete_trunc_indir(). */ freework->fw_state &= ~INPROGRESS; /* * If we have an indirdep we need to enforce the truncation order * and discard it when it is complete. */ if (indirdep) { if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && !TAILQ_EMPTY(&indirdep->ir_trunc)) { /* * Add the complete truncate to the list on the * indirdep to enforce in-order processing. 
*/ if (freework->fw_indir == NULL) TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); FREE_LOCK(ump); return; } /* * If we're goingaway, free the indirdep. Otherwise it will * linger until the write completes. */ if (goingaway) free_indirdep(indirdep); } FREE_LOCK(ump); /* Initialize pointers depending on block size. */ if (ump->um_fstype == UFS1) { bap1 = (ufs1_daddr_t *)bp->b_data; nb = bap1[freework->fw_off]; ufs1fmt = 1; bap2 = NULL; } else { bap2 = (ufs2_daddr_t *)bp->b_data; nb = bap2[freework->fw_off]; ufs1fmt = 0; bap1 = NULL; } level = lbn_level(lbn); needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; lbnadd = lbn_offset(fs, level); nblocks = btodb(fs->fs_bsize); nfreework = freework; freedeps = 0; cnt = 0; /* * Reclaim blocks. Traverses into nested indirect levels and * arranges for the current level to be freed when subordinates * are free when journaling. */ for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { if (i != NINDIR(fs) - 1) { if (ufs1fmt) nnb = bap1[i+1]; else nnb = bap2[i+1]; } else nnb = 0; if (nb == 0) continue; cnt++; if (level != 0) { nlbn = (lbn + 1) - (i * lbnadd); if (needj != 0) { nfreework = newfreework(ump, freeblks, freework, nlbn, nb, fs->fs_frag, 0, 0); freedeps++; } indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); } else { struct freedep *freedep; /* * Attempt to aggregate freedep dependencies for * all blocks being released to the same CG. */ LIST_INIT(&wkhd); if (needj != 0 && (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { freedep = newfreedep(freework); WORKLIST_INSERT_UNLOCKED(&wkhd, &freedep->fd_list); freedeps++; } CTR3(KTR_SUJ, "indir_trunc: ino %d blkno %jd size %ld", freeblks->fb_inum, nb, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, freeblks->fb_inum, freeblks->fb_vtype, &wkhd); } } if (goingaway) { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } freedblocks = 0; if (level == 0) freedblocks = (nblocks * cnt); if (needj == 0) freedblocks += nblocks; freeblks_free(ump, freeblks, freedblocks); /* * If we are journaling set up the ref counts and offset so this * indirect can be completed when its children are free. */ if (needj) { ACQUIRE_LOCK(ump); freework->fw_off = i; freework->fw_ref += freedeps; freework->fw_ref -= NINDIR(fs) + 1; if (level == 0) freeblks->fb_cgwait += freedeps; if (freework->fw_ref == 0) freework_freeblock(freework); FREE_LOCK(ump); return; } /* * If we're not journaling we can free the indirect now. */ dbn = dbtofsb(fs, dbn); CTR3(KTR_SUJ, "indir_trunc 2: ino %d blkno %jd size %ld", freeblks->fb_inum, dbn, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, freeblks->fb_inum, freeblks->fb_vtype, NULL); /* Non SUJ softdep does single-threaded truncations. */ if (freework->fw_blkno == dbn) { freework->fw_state |= ALLCOMPLETE; ACQUIRE_LOCK(ump); handle_written_freework(freework); FREE_LOCK(ump); } return; } /* * Cancel an allocindir when it is removed via truncation. When bp is not * NULL the indirect never appeared on disk and is scheduled to be freed * independently of the indir so we can more easily track journal work. */ static void cancel_allocindir(aip, bp, freeblks, trunc) struct allocindir *aip; struct buf *bp; struct freeblks *freeblks; int trunc; { struct indirdep *indirdep; struct freefrag *freefrag; struct newblk *newblk; newblk = (struct newblk *)aip; LIST_REMOVE(aip, ai_next); /* * We must eliminate the pointer in bp if it must be freed on its * own due to partial truncate or pending journal work. 
*/ if (bp && (trunc || newblk->nb_jnewblk)) { /* * Clear the pointer and mark the aip to be freed * directly if it never existed on disk. */ aip->ai_state |= DELAYEDFREE; indirdep = aip->ai_indirdep; if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; else ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; } /* * When truncating the previous pointer will be freed via * savedbp. Eliminate the freefrag which would dup free. */ if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { newblk->nb_freefrag = NULL; if (freefrag->ff_jdep) cancel_jfreefrag( WK_JFREEFRAG(freefrag->ff_jdep)); jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); WORKITEM_FREE(freefrag, D_FREEFRAG); } /* * If the journal hasn't been written the jnewblk must be passed * to the call to ffs_blkfree that reclaims the space. We accomplish * this by leaving the journal dependency on the newblk to be freed * when a freework is created in handle_workitem_freeblocks(). */ cancel_newblk(newblk, NULL, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); } /* * Create the mkdir dependencies for . and .. in a new directory. Link them * in to a newdirblk so any subsequent additions are tracked properly. The * caller is responsible for adding the mkdir1 dependency to the journal * and updating id_mkdiradd. This function returns with the per-filesystem * lock held. */ static struct mkdir * setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) struct diradd *dap; ino_t newinum; ino_t dinum; struct buf *newdirbp; struct mkdir **mkdirp; { struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk; struct mkdir *mkdir1, *mkdir2; struct worklist *wk; struct jaddref *jaddref; struct ufsmount *ump; struct mount *mp; mp = dap->da_list.wk_mp; ump = VFSTOUFS(mp); newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); LIST_INIT(&newdirblk->db_mkdir); mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); mkdir1->md_state = ATTACHED | MKDIR_BODY; mkdir1->md_diradd = dap; mkdir1->md_jaddref = NULL; mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); mkdir2->md_state = ATTACHED | MKDIR_PARENT; mkdir2->md_diradd = dap; mkdir2->md_jaddref = NULL; if (MOUNTEDSUJ(mp) == 0) { mkdir1->md_state |= DEPCOMPLETE; mkdir2->md_state |= DEPCOMPLETE; } /* * Dependency on "." and ".." being written to disk. */ mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(VFSTOUFS(mp)); LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs); /* * We must link the pagedep, allocdirect, and newdirblk for * the initial file page so the pointer to the new directory * is not written until the directory contents are live and * any subsequent additions are not marked live until the * block is reachable via the inode. 
*/ if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) panic("setup_newdir: lost pagedep"); LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) if (wk->wk_type == D_ALLOCDIRECT) break; if (wk == NULL) panic("setup_newdir: lost allocdirect"); if (pagedep->pd_state & NEWBLOCK) panic("setup_newdir: NEWBLOCK already set"); newblk = WK_NEWBLK(wk); pagedep->pd_state |= NEWBLOCK; pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); /* * Look up the inodedep for the parent directory so that we * can link mkdir2 into the pending dotdot jaddref or * the inode write if there is none. If the inode is * ALLCOMPLETE and no jaddref is present all dependencies have * been satisfied and mkdir2 can be freed. */ inodedep_lookup(mp, dinum, 0, &inodedep); if (MOUNTEDSUJ(mp)) { if (inodedep == NULL) panic("setup_newdir: Lost parent."); jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && (jaddref->ja_state & MKDIR_PARENT), ("setup_newdir: bad dotdot jaddref %p", jaddref)); LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); mkdir2->md_jaddref = jaddref; jaddref->ja_mkdir = mkdir2; } else if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); mkdir2 = NULL; } else { LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); } *mkdirp = mkdir2; return (mkdir1); } /* * Directory entry addition dependencies. * * When adding a new directory entry, the inode (with its incremented link * count) must be written to disk before the directory entry's pointer to it. * Also, if the inode is newly allocated, the corresponding freemap must be * updated (on disk) before the directory entry's pointer. These requirements * are met via undo/redo on the directory entry's pointer, which consists * simply of the inode number. * * As directory entries are added and deleted, the free space within a * directory block can become fragmented. The ufs filesystem will compact * a fragmented directory block to make space for a new entry. When this * occurs, the offsets of previously added entries change. Any "diradd" * dependency structures corresponding to these entries must be updated with * the new offsets. */ /* * This routine is called after the in-memory inode's link * count has been incremented, but before the directory entry's * pointer to the inode has been set. 
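 *
 * Editor's sketch of the expected caller ordering (schematic; an
 * assumption about the ufs_direnter() path, not a quote of it):
 *
 *	ip->i_effnlink++;
 *	softdep_change_linkcnt(ip);		record nlink delta
 *	... copy the new entry into bp ...	in-core only so far
 *	softdep_setup_directory_add(bp, dp, off,
 *	    ip->i_number, newdirbp, isnewblk);
 *
 * so the dependency exists before bp can reach the disk with the
 * new inode number in it.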
*/ int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ ino_t newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ int isnewblk; /* entry is in a newly allocated block */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk; struct mkdir *mkdir1, *mkdir2; struct jaddref *jaddref; struct ufsmount *ump; struct mount *mp; int isindir; mp = ITOVFS(dp); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_add called on non-softdep filesystem")); /* * Whiteouts have no dependencies. */ - if (newinum == WINO) { + if (newinum == UFS_WINO) { if (newdirbp != NULL) bdwrite(newdirbp); return (0); } jaddref = NULL; mkdir1 = mkdir2 = NULL; fs = ump->um_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); dap = malloc(sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; LIST_INIT(&dap->da_jwork); - isindir = bp->b_lblkno >= NDADDR; + isindir = bp->b_lblkno >= UFS_NDADDR; newdirblk = NULL; if (isnewblk && (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); LIST_INIT(&newdirblk->db_mkdir); } /* * If we're creating a new directory setup the dependencies and set * the dap state to wait for them. Otherwise it's COMPLETE and * we can move on. */ if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(ump); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, &mkdir2); } /* * Link into parent directory pagedep to await its being written. */ pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); #ifdef DEBUG if (diradd_lookup(pagedep, offset) != NULL) panic("softdep_setup_directory_add: %p already at off %d\n", diradd_lookup(pagedep, offset), offset); #endif dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); /* * If we're journaling, link the diradd into the jaddref so it * may be completed after the journal entry is written. Otherwise, * link the diradd into its inodedep. If the inode is not yet * written place it on the bufwait list, otherwise do the post-inode * write processing to put it on the id_pendinghd list. */ if (MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_add: bad jaddref %p", jaddref)); jaddref->ja_diroff = diroffset; jaddref->ja_diradd = dap; add_to_journal(&jaddref->ja_list); } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); /* * Add the journal entries for . and .. links now that the primary * link is written. 
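 *
 * Editor's note, illustrative: for a new directory the journal
 * additions end up ordered
 *
 *	1. primary link jaddref		(add_to_journal() above)
 *	2. dotdot jaddref		(mkdir2, below)
 *	3. dot jaddref			(mkdir1, below)
 *
 * matching the constraint explained in the comment that follows.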
*/ if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, inoreflst, if_deps); KASSERT(jaddref != NULL && jaddref->ja_ino == jaddref->ja_parent && (jaddref->ja_state & MKDIR_BODY), ("softdep_setup_directory_add: bad dot jaddref %p", jaddref)); mkdir1->md_jaddref = jaddref; jaddref->ja_mkdir = mkdir1; /* * It is important that the dotdot journal entry * is added prior to the dot entry since dot writes * both the dot and dotdot links. These both must * be added after the primary link for the journal * to remain consistent. */ add_to_journal(&mkdir2->md_jaddref->ja_list); add_to_journal(&jaddref->ja_list); } /* * If we are adding a new directory remember this diradd so that if * we rename it we can keep the dot and dotdot dependencies. If * we are adding a new name for an inode that has a mkdiradd we * must be in rename and we have to move the dot and dotdot * dependencies to this new name. The old name is being orphaned * soon. */ if (mkdir1 != NULL) { if (inodedep->id_mkdiradd != NULL) panic("softdep_setup_directory_add: Existing mkdir"); inodedep->id_mkdiradd = dap; } else if (inodedep->id_mkdiradd) merge_diradd(inodedep, dap); if (newdirblk != NULL) { /* * There is nothing to do if we are already tracking * this block. */ if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(ump); return (0); } if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) == 0) panic("softdep_setup_directory_add: lost entry"); WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); pagedep->pd_state |= NEWBLOCK; pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; FREE_LOCK(ump); /* * If we extended into an indirect signal direnter to sync. */ if (isindir) return (1); return (0); } FREE_LOCK(ump); return (0); } /* * This procedure is called to change the offset of a directory * entry when compacting a directory block which must be owned * exclusively by the caller. Note that the actual entry movement * must be done in this procedure to ensure that no I/O completions * occur while the move is in progress. */ void softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) struct buf *bp; /* Buffer holding directory block. */ struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ caddr_t newloc; /* address of new directory location */ int entrysize; /* size of directory entry */ { int offset, oldoffset, newoffset; struct pagedep *pagedep; struct jmvref *jmvref; struct diradd *dap; struct direct *de; struct mount *mp; struct ufsmount *ump; ufs_lbn_t lbn; int flags; mp = ITOVFS(dp); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_change_directoryentry_offset called on " "non-softdep filesystem")); de = (struct direct *)oldloc; jmvref = NULL; flags = 0; /* * Moves are always journaled as it would be too complex to * determine if any affected adds or removes are present in the * journal. 
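 *
 * Editor's note, illustrative: the jmvref created below records the
 * entry's identity and both of its locations,
 *
 *	newjmvref(dp, de->d_ino,
 *	    dp->i_offset + (oldloc - base),	old offset
 *	    dp->i_offset + (newloc - base));	new offset
 *
 * so journal recovery can find the entry whether or not the
 * compacted block made it to disk.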
*/ if (MOUNTEDSUJ(mp)) { flags = DEPALLOC; jmvref = newjmvref(dp, de->d_ino, dp->i_offset + (oldloc - base), dp->i_offset + (newloc - base)); } lbn = lblkno(ump->um_fs, dp->i_offset); offset = blkoff(ump->um_fs, dp->i_offset); oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); ACQUIRE_LOCK(ump); if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) goto done; dap = diradd_lookup(pagedep, oldoffset); if (dap) { dap->da_offset = newoffset; newoffset = DIRADDHASH(newoffset); oldoffset = DIRADDHASH(oldoffset); if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && newoffset != oldoffset) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], dap, da_pdlist); } } done: if (jmvref) { jmvref->jm_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); add_to_journal(&jmvref->jm_list); } bcopy(oldloc, newloc, entrysize); FREE_LOCK(ump); } /* * Move the mkdir dependencies and journal work from one diradd to another * when renaming a directory. The new name must depend on the mkdir deps * completing as the old name did. Directories can only have one valid link * at a time so one must be canonical. */ static void merge_diradd(inodedep, newdap) struct inodedep *inodedep; struct diradd *newdap; { struct diradd *olddap; struct mkdir *mkdir, *nextmd; struct ufsmount *ump; short state; olddap = inodedep->id_mkdiradd; inodedep->id_mkdiradd = newdap; if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { newdap->da_state &= ~DEPCOMPLETE; ump = VFSTOUFS(inodedep->id_list.wk_mp); for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != olddap) continue; mkdir->md_diradd = newdap; state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); newdap->da_state |= state; olddap->da_state &= ~state; if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) break; } if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("merge_diradd: unfound ref"); } /* * Any mkdir related journal items are not safe to be freed until * the new name is stable. */ jwork_move(&newdap->da_jwork, &olddap->da_jwork); olddap->da_state |= DEPCOMPLETE; complete_diradd(olddap); } /* * Move the diradd to the pending list when all diradd dependencies are * complete. */ static void complete_diradd(dap) struct diradd *dap; { struct pagedep *pagedep; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } /* * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal * add entries and conditionally journal the remove. */ static void cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) struct diradd *dap; struct dirrem *dirrem; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; { struct inodedep *inodedep; struct jaddref *jaddref; struct inoref *inoref; struct ufsmount *ump; struct mkdir *mkdir; /* * If no remove references were allocated we're on a non-journaled * filesystem and can skip the cancel step. */ if (jremref == NULL) { free_diradd(dap, NULL); return; } /* * Cancel the primary name and free it if it does not require * journaling. */ if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) { /* Abort the addref that references this diradd.
*/ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if (inoref->if_list.wk_type != D_JADDREF) continue; jaddref = (struct jaddref *)inoref; if (jaddref->ja_diradd != dap) continue; if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork) == 0) { free_jremref(jremref); jremref = NULL; } break; } } /* * Cancel subordinate names and free them if they do not require * journaling. */ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { ump = VFSTOUFS(dap->da_list.wk_mp); LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) { if (mkdir->md_diradd != dap) continue; if ((jaddref = mkdir->md_jaddref) == NULL) continue; mkdir->md_jaddref = NULL; if (mkdir->md_state & MKDIR_PARENT) { if (cancel_jaddref(jaddref, NULL, &dirrem->dm_jwork) == 0) { free_jremref(dotdotremref); dotdotremref = NULL; } } else { if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork) == 0) { free_jremref(dotremref); dotremref = NULL; } } } } if (jremref) journal_jremref(dirrem, jremref, inodedep); if (dotremref) journal_jremref(dirrem, dotremref, inodedep); if (dotdotremref) journal_jremref(dirrem, dotdotremref, NULL); jwork_move(&dirrem->dm_jwork, &dap->da_jwork); free_diradd(dap, &dirrem->dm_jwork); } /* * Free a diradd dependency structure. This routine must be called * with splbio interrupts blocked. */ static void free_diradd(dap, wkhd) struct diradd *dap; struct workhead *wkhd; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; struct ufsmount *ump; ump = VFSTOUFS(dap->da_list.wk_mp); LOCK_OWNED(ump); LIST_REMOVE(dap, da_pdlist); if (dap->da_state & ONWORKLIST) WORKLIST_REMOVE(&dap->da_list); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; dirrem->dm_state |= COMPLETE; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) if (inodedep->id_mkdiradd == dap) inodedep->id_mkdiradd = NULL; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); LIST_REMOVE(mkdir, md_mkdirs); if (mkdir->md_state & ONWORKLIST) WORKLIST_REMOVE(&mkdir->md_list); if (mkdir->md_jaddref != NULL) panic("free_diradd: Unexpected jaddref"); WORKITEM_FREE(mkdir, D_MKDIR); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) break; } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } if (inodedep) free_inodedep(inodedep); /* * Free any journal segments waiting for the directory write. */ handle_jwork(&dap->da_jwork); WORKITEM_FREE(dap, D_DIRADD); } /* * Directory entry removal dependencies. * * When removing a directory entry, the entry's inode pointer must be * zero'ed on disk before the corresponding inode's link count is decremented * (possibly freeing the inode for re-use). This dependency is handled by * updating the directory entry but delaying the inode count reduction until * after the directory block has been written to disk. After this point, the * inode count can be decremented whenever it is convenient. */ /* * This routine should be called immediately after removing * a directory entry. 
The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will do this task when it is safe. */ void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; struct inodedep *inodedep; struct ufsmount *ump; int direct; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_remove called on non-softdep filesystem")); /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want * newdirrem() to setup the full directory remove which requires * isrmdir > 1. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); /* * Add the dirrem to the inodedep's pending remove list for quick * discovery later. */ if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_remove: Lost inodedep."); KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); dirrem->dm_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to a zeroed entry until * the new inode is committed to disk. If the COMPLETE flag is * set then we have deleted an entry that never made it to * disk. If the entry we deleted resulted from a name change, * then the old name still resides on disk. We cannot delete * its inode (returned to us in prevdirrem) until the zeroed * directory entry gets to disk. The new inode has never been * referenced on the disk, so can be deleted immediately. */ if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); FREE_LOCK(ump); } else { if (prevdirrem != NULL) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; direct = LIST_EMPTY(&dirrem->dm_jremrefhd); FREE_LOCK(ump); if (direct) handle_workitem_remove(dirrem, 0); } } /* * Check for an entry matching 'offset' on both the pd_dirraddhd list and the * pd_pendinghd list of a pagedep. */ static struct diradd * diradd_lookup(pagedep, offset) struct pagedep *pagedep; int offset; { struct diradd *dap; LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) if (dap->da_offset == offset) return (dap); LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset == offset) return (dap); return (NULL); } /* * Search for a .. diradd dependency in a directory that is being removed. * If the directory was renamed to a new parent we have a diradd rather * than a mkdir for the .. entry. We need to cancel it now before * it is found in truncate(). */ static struct jremref * cancel_diradd_dotdot(ip, dirrem, jremref) struct inode *ip; struct dirrem *dirrem; struct jremref *jremref; { struct pagedep *pagedep; struct diradd *dap; struct worklist *wk; if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0) return (jremref); dap = diradd_lookup(pagedep, DOTDOT_OFFSET); if (dap == NULL) return (jremref); cancel_diradd(dap, dirrem, jremref, NULL, NULL); /* * Mark any journal work as belonging to the parent so it is freed * with the .. reference. 
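 *
 * Editor's note, illustrative: the loop below tags each pending work
 * item with MKDIR_PARENT; handle_workitem_remove() later diverts the
 * items so marked onto the worklist that completes with the ".."
 * link-count update rather than with the primary name.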
*/ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) wk->wk_state |= MKDIR_PARENT; return (NULL); } /* * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to * replace it with a dirrem/diradd pair as a result of re-parenting a * directory. This ensures that we don't simultaneously have a mkdir and * a diradd for the same .. entry. */ static struct jremref * cancel_mkdir_dotdot(ip, dirrem, jremref) struct inode *ip; struct dirrem *dirrem; struct jremref *jremref; { struct inodedep *inodedep; struct jaddref *jaddref; struct ufsmount *ump; struct mkdir *mkdir; struct diradd *dap; struct mount *mp; mp = ITOVFS(ip); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) return (jremref); dap = inodedep->id_mkdiradd; if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) return (jremref); ump = VFSTOUFS(inodedep->id_list.wk_mp); for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = LIST_NEXT(mkdir, md_mkdirs)) if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) break; if (mkdir == NULL) panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); if ((jaddref = mkdir->md_jaddref) != NULL) { mkdir->md_jaddref = NULL; jaddref->ja_state &= ~MKDIR_PARENT; if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_mkdir_dotdot: Lost parent inodedep"); if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { journal_jremref(dirrem, jremref, inodedep); jremref = NULL; } } if (mkdir->md_state & ONWORKLIST) WORKLIST_REMOVE(&mkdir->md_list); mkdir->md_state |= ALLCOMPLETE; complete_mkdir(mkdir); return (jremref); } static void journal_jremref(dirrem, jremref, inodedep) struct dirrem *dirrem; struct jremref *jremref; struct inodedep *inodedep; { if (inodedep == NULL) if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("journal_jremref: Lost inodedep"); LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); add_to_journal(&jremref->jr_list); } static void dirrem_journal(dirrem, jremref, dotremref, dotdotremref) struct dirrem *dirrem; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; { struct inodedep *inodedep; if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("dirrem_journal: Lost inodedep"); journal_jremref(dirrem, jremref, inodedep); if (dotremref) journal_jremref(dirrem, dotremref, inodedep); if (dotdotremref) journal_jremref(dirrem, dotdotremref, NULL); } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ static struct dirrem * newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ struct dirrem **prevdirremp; /* previously referenced inode, if any */ { int offset; ufs_lbn_t lbn; struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; struct vnode *dvp; struct ufsmount *ump; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); dvp = ITOV(dp); ump = ITOUMP(dp); /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and * we are not a snapshot, request some inodedep cleanup. 
* Limiting the number of dirrem structures will also limit * the number of freefile and freeblks structures. */ ACQUIRE_LOCK(ump); if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM)) schedule_cleanup(UFSTOVFS(ump)); else FREE_LOCK(ump); dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); LIST_INIT(&dirrem->dm_jremrefhd); LIST_INIT(&dirrem->dm_jwork); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; /* * Allocate remove reference structures to track journal write * dependencies. We will always have one for the link and * when doing directories we will always have one more for dot. * When renaming a directory we skip the dotdot link change so * this is not needed. */ jremref = dotremref = dotdotremref = NULL; if (DOINGSUJ(dvp)) { if (isrmdir) { jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 2); dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, ip->i_effnlink + 1); dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, dp->i_effnlink + 1); dotdotremref->jr_state |= MKDIR_PARENT; } else jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 1); } ACQUIRE_LOCK(ump); lbn = lblkno(ump->um_fs, dp->i_offset); offset = blkoff(ump->um_fs, dp->i_offset); pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC, &pagedep); dirrem->dm_pagedep = pagedep; dirrem->dm_offset = offset; /* * If we're renaming a .. link to a new directory, cancel any * existing MKDIR_PARENT mkdir. If it has already been canceled * the jremref is preserved for any potential diradd in this * location. This can not coincide with a rmdir. */ if (dp->i_offset == DOTDOT_OFFSET) { if (isrmdir) panic("newdirrem: .. directory change during remove?"); jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); } /* * If we're removing a directory search for the .. dependency now and * cancel it. Any pending journal work will be added to the dirrem * to be completed when the workitem remove completes. */ if (isrmdir) dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can * be de-allocated. */ dap = diradd_lookup(pagedep, offset); if (dap == NULL) { /* * Link the jremref structures into the dirrem so they are * written prior to the pagedep. */ if (jremref) dirrem_journal(dirrem, jremref, dotremref, dotdotremref); return (dirrem); } /* * Must be ATTACHED at this point. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %ju should be %ju", (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum); /* * If we are deleting a changed name that never made it to disk, * then return the dirrem describing the previous inode (which * represents the inode currently referenced from this entry on disk). */ if ((dap->da_state & DIRCHG) != 0) { *prevdirremp = dap->da_previous; dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } /* * We are deleting an entry that never made it to disk. * Mark it COMPLETE so we can delete its inode immediately. 
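 *
 * Editor's illustration, not part of this change: the timeline here
 * is create-then-remove before the directory block is ever written,
 *
 *	diradd created		entry exists in the in-core block only
 *	(no write of bp)
 *	dirrem created		cancels the diradd just found
 *
 * so nothing on disk references the inode and the usual "zero the
 * entry on disk first" rollback can be skipped.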
*/ dirrem->dm_state |= COMPLETE; cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); #ifdef SUJ_DEBUG if (isrmdir == 0) { struct worklist *wk; LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) panic("bad wk %p (0x%X)\n", wk, wk->wk_state); } #endif return (dirrem); } /* * Directory entry change dependencies. * * Changing an existing directory entry requires that an add operation * be completed first followed by a deletion. The semantics for the addition * are identical to the description of adding a new entry above except * that the rollback is to the old inode number rather than zero. Once * the addition dependency is completed, the removal is done as described * in the removal routine above. */ /* * This routine should be called immediately after changing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will perform this task when it is safe. */ void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ ino_t newinum; /* new inode number for changed entry */ int isrmdir; /* indicates if doing RMDIR */ { int offset; struct diradd *dap = NULL; struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct jaddref *jaddref; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(dp); ump = VFSTOUFS(mp); offset = blkoff(ump->um_fs, dp->i_offset); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_change called on non-softdep filesystem")); /* * Whiteouts do not need diradd dependencies. */ - if (newinum != WINO) { + if (newinum != UFS_WINO) { dap = malloc(sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; LIST_INIT(&dap->da_jwork); } /* * Allocate a new dirrem and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: * 0 - non-directory file rename * 1 - directory rename within same directory * inum - directory rename to new directory of given inode number * When renaming to a new directory, we are both deleting and * creating a new directory entry, so the link count on the new * directory should not change. Thus we do not need the followup * dirrem which is usually done in handle_workitem_remove. We set * the DIRCHG flag to tell handle_workitem_remove to skip the * followup dirrem. */ if (isrmdir > 1) dirrem->dm_state |= DIRCHG; /* * Whiteouts have no additional dependencies, * so just put the dirrem on the correct list. */ - if (newinum == WINO) { + if (newinum == UFS_WINO) { if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } FREE_LOCK(ump); return; } /* * Add the dirrem to the inodedep's pending remove list for quick * discovery later. A valid nlinkdelta ensures that this lookup * will not fail. 
*/ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_directory_change: Lost inodedep."); dirrem->dm_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to the previous inode until * the new inode is committed to disk. If the COMPLETE flag is * set, then we have deleted an entry that never made it to disk. * If the entry we deleted resulted from a name change, then the old * inode reference still resides on disk. Any rollback that we do * needs to be to that old inode (returned to us in prevdirrem). If * the entry we deleted resulted from a create, then there is * no entry on the disk, so we want to roll back to zero rather * than the uncommitted inode. In either of the COMPLETE cases we * want to immediately free the unwritten and unreferenced inode. */ if ((dirrem->dm_state & COMPLETE) == 0) { dap->da_previous = dirrem; } else { if (prevdirrem != NULL) { dap->da_previous = prevdirrem; } else { dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } /* * Look up the jaddref for this journal entry. We must finish * initializing it and make the diradd write dependent on it. * If we're not journaling, put it on the id_bufwait list if the * inode is not yet written. If it is written, do the post-inode * write processing to put it on the id_pendinghd list. */ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); if (MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_change: bad jaddref %p", jaddref)); jaddref->ja_diroff = dp->i_offset; jaddref->ja_diradd = dap; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); add_to_journal(&jaddref->ja_list); } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } else { LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } /* * If we're making a new name for a directory that has not been * committed, we need to move the dot and dotdot references to * this new name. */ if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) merge_diradd(inodedep, dap); FREE_LOCK(ump); } /* * Called whenever the link count on an inode is changed. * It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. */ void softdep_change_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_change_linkcnt called on non-softdep filesystem")); ACQUIRE_LOCK(ump); inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(ump); } /* * Attach a sbdep dependency to the superblock buf so that we can keep * track of the head of the linked list of referenced but unlinked inodes.
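 *
 * Editor's sketch of the on-disk list being tracked (illustrative):
 *
 *	fs->fs_sujfree -> ino A
 *	dino(A)->di_freelink -> ino B
 *	dino(B)->di_freelink -> 0
 *
 * After a crash the journal code can walk this chain to reclaim
 * inodes whose directory entries are already gone.  The b_dep scan
 * below merely keeps the hook idempotent: at most one sbdep is
 * attached per superblock buf.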
*/ void softdep_setup_sbupdate(ump, fs, bp) struct ufsmount *ump; struct fs *fs; struct buf *bp; { struct sbdep *sbdep; struct worklist *wk; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_sbupdate called on non-softdep filesystem")); LIST_FOREACH(wk, &bp->b_dep, wk_list) if (wk->wk_type == D_SBDEP) break; if (wk != NULL) return; sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); sbdep->sb_fs = fs; sbdep->sb_ump = ump; ACQUIRE_LOCK(ump); WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); FREE_LOCK(ump); } /* * Return the first unlinked inodedep which is ready to be the head of the * list. The inodedep and all those after it must have valid next pointers. */ static struct inodedep * first_unlinked_inodedep(ump) struct ufsmount *ump; { struct inodedep *inodedep; struct inodedep *idp; LOCK_OWNED(ump); for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); inodedep; inodedep = idp) { if ((inodedep->id_state & UNLINKNEXT) == 0) return (NULL); idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) break; if ((inodedep->id_state & UNLINKPREV) == 0) break; } return (inodedep); } /* * Set the sujfree unlinked head pointer prior to writing a superblock. */ static void initiate_write_sbdep(sbdep) struct sbdep *sbdep; { struct inodedep *inodedep; struct fs *bpfs; struct fs *fs; bpfs = sbdep->sb_fs; fs = sbdep->sb_ump->um_fs; inodedep = first_unlinked_inodedep(sbdep->sb_ump); if (inodedep) { fs->fs_sujfree = inodedep->id_ino; inodedep->id_state |= UNLINKPREV; } else fs->fs_sujfree = 0; bpfs->fs_sujfree = fs->fs_sujfree; } /* * After a superblock is written determine whether it must be written again * due to a changing unlinked list head. */ static int handle_written_sbdep(sbdep, bp) struct sbdep *sbdep; struct buf *bp; { struct inodedep *inodedep; struct fs *fs; LOCK_OWNED(sbdep->sb_ump); fs = sbdep->sb_fs; /* * If the superblock doesn't match the in-memory list start over. */ inodedep = first_unlinked_inodedep(sbdep->sb_ump); if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || (inodedep == NULL && fs->fs_sujfree != 0)) { bdirty(bp); return (1); } WORKITEM_FREE(sbdep, D_SBDEP); if (fs->fs_sujfree == 0) return (0); /* * Now that we have a record of this inode in stable store allow it * to be written to free up pending work. Inodes may see a lot of * write activity after they are unlinked which we must not hold up. */ for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) panic("handle_written_sbdep: Bad inodedep %p (0x%X)", inodedep, inodedep->id_state); if (inodedep->id_state & UNLINKONLIST) break; inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; } return (0); } /* * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. */ static void unlinked_inodedep(mp, inodedep) struct mount *mp; struct inodedep *inodedep; { struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if (MOUNTEDSUJ(mp) == 0) return; ump->um_fs->fs_fmod = 1; if (inodedep->id_state & UNLINKED) panic("unlinked_inodedep: %p already unlinked\n", inodedep); inodedep->id_state |= UNLINKED; TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); } /* * Remove an inodedep from the unlinked inodedep list. This may require * disk writes if the inode has made it that far. 
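 *
 * Editor's illustration, schematic: removing the middle element B
 * from the on-disk chain
 *
 *	fs_sujfree -> A -> B -> C
 *
 * requires first rewriting the predecessor's pointer on disk,
 *
 *	dino(A)->di_freelink = C;	(inode block write)
 *	fs_sujfree = C;			(superblock write, if B is head)
 *
 * which is why the loop below may sleep on buffer locks and must
 * re-check its neighbors after every sleep.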
*/ static void clear_unlinked_inodedep(inodedep) struct inodedep *inodedep; { struct ufsmount *ump; struct inodedep *idp; struct inodedep *idn; struct fs *fs; struct buf *bp; ino_t ino; ino_t nino; ino_t pino; int error; ump = VFSTOUFS(inodedep->id_list.wk_mp); fs = ump->um_fs; ino = inodedep->id_ino; error = 0; for (;;) { LOCK_OWNED(ump); KASSERT((inodedep->id_state & UNLINKED) != 0, ("clear_unlinked_inodedep: inodedep %p not unlinked", inodedep)); /* * If nothing has yet been written simply remove us from * the in memory list and return. This is the most common * case where handle_workitem_remove() loses the final * reference. */ if ((inodedep->id_state & UNLINKLINKS) == 0) break; /* * If we have a NEXT pointer and no PREV pointer we can simply * clear NEXT's PREV and remove ourselves from the list. Be * careful not to clear PREV if the superblock points at * next as well. */ idn = TAILQ_NEXT(inodedep, id_unlinked); if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { if (idn && fs->fs_sujfree != idn->id_ino) idn->id_state &= ~UNLINKPREV; break; } /* * Here we have an inodedep which is actually linked into * the list. We must remove it by forcing a write to the * link before us, whether it be the superblock or an inode. * Unfortunately the list may change while we're waiting * on the buf lock for either resource so we must loop until * we lock the right one. If both the superblock and an * inode point to this inode we must clear the inode first * followed by the superblock. */ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); pino = 0; if (idp && (idp->id_state & UNLINKNEXT)) pino = idp->id_ino; FREE_LOCK(ump); if (pino == 0) { bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); } else { error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, pino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) brelse(bp); } ACQUIRE_LOCK(ump); if (error) break; /* If the list has changed restart the loop. */ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); nino = 0; if (idp && (idp->id_state & UNLINKNEXT)) nino = idp->id_ino; if (nino != pino || (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); continue; } nino = 0; idn = TAILQ_NEXT(inodedep, id_unlinked); if (idn) nino = idn->id_ino; /* * Remove us from the in memory list. After this we cannot * access the inodedep. */ KASSERT((inodedep->id_state & UNLINKED) != 0, ("clear_unlinked_inodedep: inodedep %p not unlinked", inodedep)); inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); FREE_LOCK(ump); /* * The predecessor's next pointer is manually updated here * so that the NEXT flag is never cleared for an element * that is in the list. */ if (pino == 0) { bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, ump); softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); } else if (fs->fs_magic == FS_UFS1_MAGIC) ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, pino))->di_freelink = nino; else ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, pino))->di_freelink = nino; /* * If the bwrite fails we have no recourse to recover. The * filesystem is corrupted already. */ bwrite(bp); ACQUIRE_LOCK(ump); /* * If the superblock pointer still needs to be cleared force * a write here. 
*/ if (fs->fs_sujfree == ino) { FREE_LOCK(ump); bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, ump); softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bwrite(bp); ACQUIRE_LOCK(ump); } if (fs->fs_sujfree != ino) return; panic("clear_unlinked_inodedep: Failed to clear free head"); } if (inodedep->id_ino == fs->fs_sujfree) panic("clear_unlinked_inodedep: Freeing head of free list"); inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); return; } /* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ static int handle_workitem_remove(dirrem, flags) struct dirrem *dirrem; int flags; { struct inodedep *inodedep; struct workhead dotdotwk; struct worklist *wk; struct ufsmount *ump; struct mount *mp; struct vnode *vp; struct inode *ip; ino_t oldinum; if (dirrem->dm_state & ONWORKLIST) panic("handle_workitem_remove: dirrem %p still on worklist", dirrem); oldinum = dirrem->dm_oldinum; mp = dirrem->dm_list.wk_mp; ump = VFSTOUFS(mp); flags |= LK_EXCLUSIVE; if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) return (EBUSY); ip = VTOI(vp); ACQUIRE_LOCK(ump); if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); if (dirrem->dm_state & ONDEPLIST) LIST_REMOVE(dirrem, dm_inonext); KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), ("handle_workitem_remove: Journal entries not written.")); /* * Move all dependencies waiting on the remove to complete * from the dirrem to the inode inowait list to be completed * after the inode has been updated and written to disk. Any * marked MKDIR_PARENT are saved to be completed when the .. ref * is removed. */ LIST_INIT(&dotdotwk); while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { WORKLIST_REMOVE(wk); if (wk->wk_state & MKDIR_PARENT) { wk->wk_state &= ~MKDIR_PARENT; WORKLIST_INSERT(&dotdotwk, wk); continue; } WORKLIST_INSERT(&inodedep->id_inowait, wk); } LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); if (ip->i_nlink == 0) unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT(LIST_EMPTY(&dirrem->dm_jwork), ("handle_workitem_remove: worklist not empty. %s", TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(ump); goto out; } /* * Directory deletion. Decrement reference count for both the * just deleted parent directory entry and the reference for ".". * Arrange to have the reference count on the parent decremented * to account for the loss of "..". */ ip->i_nlink -= 2; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); if (ip->i_nlink == 0) unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; /* * Rename a directory to a new parent. Since, we are both deleting * and creating a new directory entry, the link count on the new * directory should not change. Thus we skip the followup dirrem. 
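 *
 * Editor's note, illustrative: when a directory is renamed into a
 * new parent, the ".." reference it gives up and the one it gains
 * net out, so the followup dirrem that would normally adjust the
 * parent's link count is suppressed via the DIRCHG test below.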
*/ if (dirrem->dm_state & DIRCHG) { KASSERT(LIST_EMPTY(&dirrem->dm_jwork), ("handle_workitem_remove: DIRCHG and worklist not empty.")); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(ump); goto out; } dirrem->dm_state = ONDEPLIST; dirrem->dm_oldinum = dirrem->dm_dirinum; /* * Place the dirrem on the parent's diremhd list. */ if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) panic("handle_workitem_remove: lost dir inodedep"); LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the allocated inode has never been written to disk, then * the on-disk inode is zero'ed and we can remove the file * immediately. When journaling if the inode has been marked * unlinked and not DEPCOMPLETE we know it can never be written. */ inodedep_lookup(mp, oldinum, 0, &inodedep); if (inodedep == NULL || (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || check_inode_unwritten(inodedep)) { FREE_LOCK(ump); vput(vp); return handle_workitem_remove(dirrem, flags); } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(ump); ip->i_flag |= IN_CHANGE; out: ffs_update(vp, 0); vput(vp); return (0); } /* * Inode de-allocation dependencies. * * When an inode's link count is reduced to zero, it can be de-allocated. We * found it convenient to postpone de-allocation until after the inode is * written to disk with its new link count (zero). At this point, all of the * on-disk inode's block pointers are nullified and, with careful dependency * list ordering, all dependencies related to the inode will be satisfied and * the corresponding dependency structures de-allocated. So, if/when the * inode is reused, there will be no mixing of old dependencies with new * ones. This artificial dependency is set up by the block de-allocation * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. */ static void handle_workitem_freefile(freefile) struct freefile *freefile; { struct workhead wkhd; struct fs *fs; struct inodedep *idp; struct ufsmount *ump; int error; ump = VFSTOUFS(freefile->fx_list.wk_mp); fs = ump->um_fs; #ifdef DEBUG ACQUIRE_LOCK(ump); error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(ump); if (error) panic("handle_workitem_freefile: inodedep %p survived", idp); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); LIST_INIT(&wkhd); LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefile, D_FREEFILE); FREE_LOCK(ump); } /* * Helper function which unlinks marker element from work list and returns * the next element on the list. */ static __inline struct worklist * markernext(struct worklist *marker) { struct worklist *next; next = LIST_NEXT(marker, wk_list); LIST_REMOVE(marker, wk_list); return next; } /* * Disk writes. * * The dependency structures constructed above are most actively used when file * system blocks are written to disk. No constraints are placed on when a * block can be written, but unsatisfied update dependencies are made safe by * modifying (or replacing) the source memory for the duration of the disk * write. When the disk write completes, the memory block is again brought * up-to-date. * * In-core inode structure reclamation. * * Because there are a finite number of "in-core" inode structures, they are * reused regularly. 
By transferring all inode-related dependencies to the * in-memory inode block and indexing them separately (via "inodedep"s), we * can allow "in-core" inode structures to be reused at any time and avoid * any increase in contention. * * Called just before entering the device driver to initiate a new disk I/O. * The buffer must be locked, thus, no I/O completion operations can occur * while we are manipulating its associated dependencies. */ static void softdep_disk_io_initiation(bp) struct buf *bp; /* structure describing disk write to occur */ { struct worklist *wk; struct worklist marker; struct inodedep *inodedep; struct freeblks *freeblks; struct jblkdep *jblkdep; struct newblk *newblk; struct ufsmount *ump; /* * We only care about write operations. There should never * be dependencies for reads. */ if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); if (bp->b_vflags & BV_BKGRDINPROG) panic("softdep_disk_io_initiation: Writing buffer with " "background write in progress: %p", bp); if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) return; ump = VFSTOUFS(wk->wk_mp); marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ ACQUIRE_LOCK(ump); /* * Do any necessary pre-I/O processing. */ for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; wk = markernext(&marker)) { LIST_INSERT_AFTER(wk, &marker, wk_list); switch (wk->wk_type) { case D_PAGEDEP: initiate_write_filepage(WK_PAGEDEP(wk), bp); continue; case D_INODEDEP: inodedep = WK_INODEDEP(wk); if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) initiate_write_inodeblock_ufs1(inodedep, bp); else initiate_write_inodeblock_ufs2(inodedep, bp); continue; case D_INDIRDEP: initiate_write_indirdep(WK_INDIRDEP(wk), bp); continue; case D_BMSAFEMAP: initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); continue; case D_JSEG: WK_JSEG(wk)->js_buf = NULL; continue; case D_FREEBLKS: freeblks = WK_FREEBLKS(wk); jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); /* * We have to wait for the freeblks to be journaled * before we can write an inodeblock with updated * pointers. Be careful to arrange the marker so * we revisit the freeblks if it's not removed by * the first jwait(). */ if (jblkdep != NULL) { LIST_REMOVE(&marker, wk_list); LIST_INSERT_BEFORE(wk, &marker, wk_list); jwait(&jblkdep->jb_list, MNT_WAIT); } continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: /* * We have to wait for the jnewblk to be journaled * before we can write to a block if the contents * may be confused with an earlier file's indirect * at recovery time. Handle the marker as described * above. */ newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk != NULL && indirblk_lookup(newblk->nb_list.wk_mp, newblk->nb_newblkno)) { LIST_REMOVE(&marker, wk_list); LIST_INSERT_BEFORE(wk, &marker, wk_list); jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); } continue; case D_SBDEP: initiate_write_sbdep(WK_SBDEP(wk)); continue; case D_MKDIR: case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: continue; default: panic("handle_disk_io_initiation: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(ump); PRELE(curproc); /* Allow swapout of kernel stack */ } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in a directory. The buffer must be locked, * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. 
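 *
 * Editor's sketch of the rollback protocol applied here
 * (illustrative, names as in this file):
 *
 *	initiate_write_filepage():	ep->d_ino = 0 (or the old
 *					inum for DIRCHG); dap UNDONE
 *	... the now-safe block is written ...
 *	I/O completion handler:		ep->d_ino = dap->da_newinum;
 *					dap ATTACHED again
 *
 * so a new inode number is never visible on disk before the inode
 * it names.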
*/ static void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { struct jremref *jremref; struct jmvref *jmvref; struct dirrem *dirrem; struct diradd *dap; struct direct *ep; int i; if (pagedep->pd_state & IOSTARTED) { /* * This can only happen if there is a driver that does not * understand chaining. Here biodone will reissue the call * to strategy for the incomplete buffers. */ printf("initiate_write_filepage: already started\n"); return; } pagedep->pd_state |= IOSTARTED; /* * Wait for all journal remove dependencies to hit the disk. * We can not allow any potentially conflicting directory adds * to be visible before removes and rollback is too difficult. * The per-filesystem lock may be dropped and re-acquired, however * we hold the buf locked so the dependency can not go away. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) jwait(&jremref->jr_list, MNT_WAIT); while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) jwait(&jmvref->jm_list, MNT_WAIT); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) panic("%s: dir inum %ju != new %ju", "initiate_write_filepage", (uintmax_t)ep->d_ino, (uintmax_t)dap->da_newinum); if (dap->da_state & DIRCHG) ep->d_ino = dap->da_previous->dm_oldinum; else ep->d_ino = 0; dap->da_state &= ~ATTACHED; dap->da_state |= UNDONE; } } } /* * Version of initiate_write_inodeblock that handles UFS1 dinodes. * Note that any bug fixes made to this routine must be done in the * version found below. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs1(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; struct inoref *inoref; struct ufsmount *ump; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs1: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; ump = VFSTOUFS(inodedep->id_list.wk_mp); LOCK_OWNED(ump); dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If we're on the unlinked list but have not yet written our * next pointer initialize it here. */ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); dp->di_freelink = inon ? inon->id_ino : 0; } /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino1 != NULL) panic("initiate_write_inodeblock_ufs1: I/O underway"); FREE_LOCK(ump); sip = malloc(sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); inodedep->id_savedino1 = sip; *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; dp->di_freelink = inodedep->id_savedino1->di_freelink; return; } /* * If no dependencies, then there is nothing to roll back. 
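 *
 * Editor's note, illustrative: id_savedsize and id_savednlink, set
 * just below, snapshot the fields the rollbacks in this routine may
 * clobber; the completion path restores them so the next write of
 * this inode block starts from the true in-core values.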
*/ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* * Revert the link count to that of the first unwritten journal entry. */ inoref = TAILQ_FIRST(&inodedep->id_inoreflst); if (inoref) dp->di_nlink = inoref->if_nlink; /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_offset; - if (adp->ad_offset < NDADDR && + if (adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_offset, dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_offset >= NDADDR && - dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= UFS_NDADDR && + dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", - (intmax_t)adp->ad_offset - NDADDR, - dp->di_ib[adp->ad_offset - NDADDR], + (intmax_t)adp->ad_offset - UFS_NDADDR, + dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_offset >= NDADDR) + if (adp->ad_offset >= UFS_NDADDR) break; dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; - for (i = adp->ad_offset + 1; i < NDADDR; i++) { + for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } - for (i = 0; i < NIADDR; i++) { + for (i = 0; i < UFS_NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && - (deplist & ((1 << NDADDR) << i)) == 0) + (deplist & ((1 << UFS_NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. 
* Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_offset - NDADDR] = 0; + dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0; } /* * Version of initiate_write_inodeblock that handles UFS2 dinodes. * Note that any bug fixes made to this routine must be done in the * version found above. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs2(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; struct inoref *inoref; struct ufsmount *ump; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs2: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; ump = VFSTOUFS(inodedep->id_list.wk_mp); LOCK_OWNED(ump); dp = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If we're on the unlinked list but have not yet written our * next pointer initialize it here. */ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); dp->di_freelink = inon ? inon->id_ino : 0; } /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino2 != NULL) panic("initiate_write_inodeblock_ufs2: I/O underway"); FREE_LOCK(ump); sip = malloc(sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); inodedep->id_savedino2 = sip; *inodedep->id_savedino2 = *dp; bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); dp->di_gen = inodedep->id_savedino2->di_gen; dp->di_freelink = inodedep->id_savedino2->di_freelink; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_extupdt) && TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* * Revert the link count to that of the first unwritten journal entry. */ inoref = TAILQ_FIRST(&inodedep->id_inoreflst); if (inoref) dp->di_nlink = inoref->if_nlink; /* * Set the ext data dependencies to busy. 
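 *
 * Concretely, "busy" is the state flip applied to each entry in the
 * loop below (shown here only as an illustrative sketch):
 *
 *	adp->ad_state &= ~ATTACHED;
 *	adp->ad_state |= UNDONE;
 *
 * UNDONE records that the on-disk copy is being rolled back and is
 * cleared again once the write completes.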
*/ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_offset; if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_offset, (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the ext * data which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; - for (i = adp->ad_offset + 1; i < NXADDR; i++) { + for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* INVARIANTS */ dp->di_extb[i] = 0; } lastadp = NULL; break; } /* * If we have zero'ed out the last allocated block of the ext * data, roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; } /* * Set the file data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); if ((adp->ad_state & ATTACHED) == 0) panic("inodedep %p and adp %p not attached", inodedep, adp); prevlbn = adp->ad_offset; - if (adp->ad_offset < NDADDR && + if (adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_offset, (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); - if (adp->ad_offset >= NDADDR && - dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) + if (adp->ad_offset >= UFS_NDADDR && + dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) panic("%s indirect pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock:", - (intmax_t)adp->ad_offset - NDADDR, - (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], + (intmax_t)adp->ad_offset - UFS_NDADDR, + (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. 
Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { - if (adp->ad_offset >= NDADDR) + if (adp->ad_offset >= UFS_NDADDR) break; dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; - for (i = adp->ad_offset + 1; i < NDADDR; i++) { + for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } - for (i = 0; i < NIADDR; i++) { + for (i = 0; i < UFS_NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && - (deplist & ((1 << NDADDR) << i)) == 0) + (deplist & ((1 << UFS_NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep3"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) - dp->di_ib[adp->ad_offset - NDADDR] = 0; + dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0; } /* * Cancel an indirdep as a result of truncation. Release all of the * children allocindirs and place their journal work on the appropriate * list. */ static void cancel_indirdep(indirdep, bp, freeblks) struct indirdep *indirdep; struct buf *bp; struct freeblks *freeblks; { struct allocindir *aip; /* * None of the indirect pointers will ever be visible, * so they can simply be tossed. GOINGAWAY ensures * that allocated pointers will be saved in the buffer * cache until they are freed. Note that they will * only be able to be found by their physical address * since the inode mapping the logical address will * be gone. The save buffer used for the safe copy * was allocated in setup_allocindir_phase2 using * the physical address so it could be used for this * purpose. Hence we swap the safe copy with the real * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. */ if (indirdep->ir_state & GOINGAWAY) panic("cancel_indirdep: already gone"); if ((indirdep->ir_state & DEPCOMPLETE) == 0) { indirdep->ir_state |= DEPCOMPLETE; LIST_REMOVE(indirdep, ir_next); } indirdep->ir_state |= GOINGAWAY; /* * Pass in bp for blocks still have journal writes * pending so we can cancel them on their own. 
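 *
 * As an illustrative sketch of the four drain loops below: only
 * ir_deplisthd can still carry journal work, so it alone is handed
 * the buffer,
 *
 *	cancel_allocindir(aip, bp, freeblks, 0);	ir_deplisthd
 *	cancel_allocindir(aip, NULL, freeblks, 0);	the other three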
*/ while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL) cancel_allocindir(aip, bp, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); /* * If there are pending partial truncations we need to keep the * old block copy around until they complete. This is because * the current b_data is not a perfect superset of the available * blocks. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); else bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); WORKLIST_REMOVE(&indirdep->ir_list); WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); indirdep->ir_bp = NULL; indirdep->ir_freeblks = freeblks; } /* * Free an indirdep once it no longer has new pointers to track. */ static void free_indirdep(indirdep) struct indirdep *indirdep; { KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), ("free_indirdep: Indir trunc list not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_completehd), ("free_indirdep: Complete head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_writehd), ("free_indirdep: write head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_donehd), ("free_indirdep: done head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), ("free_indirdep: deplist head not empty.")); KASSERT((indirdep->ir_state & DEPCOMPLETE), ("free_indirdep: %p still on newblk list.", indirdep)); KASSERT(indirdep->ir_saveddata == NULL, ("free_indirdep: %p still has saved data.", indirdep)); if (indirdep->ir_state & ONWORKLIST) WORKLIST_REMOVE(&indirdep->ir_list); WORKITEM_FREE(indirdep, D_INDIRDEP); } /* * Called before a write to an indirdep. This routine is responsible for * rolling back pointers to a safe state which includes only those * allocindirs which have been completed. */ static void initiate_write_indirdep(indirdep, bp) struct indirdep *indirdep; struct buf *bp; { struct ufsmount *ump; indirdep->ir_state |= IOSTARTED; if (indirdep->ir_state & GOINGAWAY) panic("disk_io_initiation: indirdep gone"); /* * If there are no remaining dependencies, this will be writing * the real pointers. */ if (LIST_EMPTY(&indirdep->ir_deplisthd) && TAILQ_EMPTY(&indirdep->ir_trunc)) return; /* * Replace up-to-date version with safe version. */ if (indirdep->ir_saveddata == NULL) { ump = VFSTOUFS(indirdep->ir_list.wk_mp); LOCK_OWNED(ump); FREE_LOCK(ump); indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); } indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount); } /* * Called when an inode has been cleared in a cg bitmap. 
This finally * eliminates any canceled jaddrefs */ void softdep_setup_inofree(mp, bp, ino, wkhd) struct mount *mp; struct buf *bp; ino_t ino; struct workhead *wkhd; { struct worklist *wk, *wkn; struct inodedep *inodedep; struct ufsmount *ump; uint8_t *inosused; struct cg *cgp; struct fs *fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inofree called on non-softdep filesystem")); ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); fs = ump->um_fs; cgp = (struct cg *)bp->b_data; inosused = cg_inosused(cgp); if (isset(inosused, ino % fs->fs_ipg)) panic("softdep_setup_inofree: inode %ju not freed.", (uintmax_t)ino); if (inodedep_lookup(mp, ino, 0, &inodedep)) panic("softdep_setup_inofree: ino %ju has existing inodedep %p", (uintmax_t)ino, inodedep); if (wkhd) { LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { if (wk->wk_type != D_JADDREF) continue; WORKLIST_REMOVE(wk); /* * We can free immediately even if the jaddref * isn't attached in a background write as now * the bitmaps are reconciled. */ wk->wk_state |= COMPLETE | ATTACHED; free_jaddref(WK_JADDREF(wk)); } jwork_move(&bp->b_dep, wkhd); } FREE_LOCK(ump); } /* * Called via ffs_blkfree() after a set of frags has been cleared from a cg * map. Any dependencies waiting for the write to clear are added to the * buf's list and any jnewblks that are being canceled are discarded * immediately. */ void softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) struct mount *mp; struct buf *bp; ufs2_daddr_t blkno; int frags; struct workhead *wkhd; { struct bmsafemap *bmsafemap; struct jnewblk *jnewblk; struct ufsmount *ump; struct worklist *wk; struct fs *fs; #ifdef SUJ_DEBUG uint8_t *blksfree; struct cg *cgp; ufs2_daddr_t jstart; ufs2_daddr_t jend; ufs2_daddr_t end; long bno; int i; #endif CTR3(KTR_SUJ, "softdep_setup_blkfree: blkno %jd frags %d wk head %p", blkno, frags, wkhd); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_blkfree called on non-softdep filesystem")); ACQUIRE_LOCK(ump); /* Lookup the bmsafemap so we track when it is dirty. */ fs = ump->um_fs; bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); /* * Detach any jnewblks which have been canceled. They must linger * until the bitmap is cleared again by ffs_blkfree() to prevent * an unjournaled allocation from hitting the disk. */ if (wkhd) { while ((wk = LIST_FIRST(wkhd)) != NULL) { CTR2(KTR_SUJ, "softdep_setup_blkfree: blkno %jd wk type %d", blkno, wk->wk_type); WORKLIST_REMOVE(wk); if (wk->wk_type != D_JNEWBLK) { WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); continue; } jnewblk = WK_JNEWBLK(wk); KASSERT(jnewblk->jn_state & GOINGAWAY, ("softdep_setup_blkfree: jnewblk not canceled.")); #ifdef SUJ_DEBUG /* * Assert that this block is free in the bitmap * before we discard the jnewblk. */ cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); bno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isset(blksfree, bno + i)) continue; panic("softdep_setup_blkfree: not free"); } #endif /* * Even if it's not attached we can free immediately * as the new bitmap is correct. */ wk->wk_state |= COMPLETE | ATTACHED; free_jnewblk(jnewblk); } } #ifdef SUJ_DEBUG /* * Assert that we are not freeing a block which has an outstanding * allocation dependency. */ fs = VFSTOUFS(mp)->um_fs; bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); end = blkno + frags; LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { /* * Don't match against blocks that will be freed when the * background write is done. 
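 *
 * As a sketch, the test below flags a free of the fragment range
 * [blkno, end) whose start or end lands inside a pending journaled
 * allocation [jstart, jend):
 *
 *	(blkno >= jstart && blkno < jend) ||
 *	(end > jstart && end <= jend)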
*/ if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == (COMPLETE | DEPCOMPLETE)) continue; jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; jend = jnewblk->jn_blkno + jnewblk->jn_frags; if ((blkno >= jstart && blkno < jend) || (end > jstart && end <= jend)) { printf("state 0x%X %jd - %d %d dep %p\n", jnewblk->jn_state, jnewblk->jn_blkno, jnewblk->jn_oldfrags, jnewblk->jn_frags, jnewblk->jn_dep); panic("softdep_setup_blkfree: " "%jd-%jd(%d) overlaps with %jd-%jd", blkno, end, frags, jstart, jend); } } #endif FREE_LOCK(ump); } /* * Revert a block allocation when the journal record that describes it * is not yet written. */ static int jnewblk_rollback(jnewblk, fs, cgp, blksfree) struct jnewblk *jnewblk; struct fs *fs; struct cg *cgp; uint8_t *blksfree; { ufs1_daddr_t fragno; long cgbno, bbase; int frags, blk; int i; frags = 0; cgbno = dtogd(fs, jnewblk->jn_blkno); /* * We have to test which frags need to be rolled back. We may * be operating on a stale copy when doing background writes. */ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) if (isclr(blksfree, cgbno + i)) frags++; if (frags == 0) return (0); /* * This is mostly ffs_blkfree() sans some validation and * superblock updates. */ if (frags == fs->fs_frag) { fragno = fragstoblks(fs, cgbno); ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; } else { cgbno += jnewblk->jn_oldfrags; bbase = cgbno - fragnum(fs, cgbno); /* Decrement the old frags. */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* Deallocate the fragment */ for (i = 0; i < frags; i++) setbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree += frags; /* Add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* If a complete block has been reassembled, account for it. */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; } } stat_jnewblk++; jnewblk->jn_state &= ~ATTACHED; jnewblk->jn_state |= UNDONE; return (frags); } static void initiate_write_bmsafemap(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; /* The cg block. */ { struct jaddref *jaddref; struct jnewblk *jnewblk; uint8_t *inosused; uint8_t *blksfree; struct cg *cgp; struct fs *fs; ino_t ino; /* * If this is a background write, we did this at the time that * the copy was made, so do not need to do it again. */ if (bmsafemap->sm_state & IOSTARTED) return; bmsafemap->sm_state |= IOSTARTED; /* * Clear any inode allocations which are pending journal writes. */ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; inosused = cg_inosused(cgp); LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { ino = jaddref->ja_ino % fs->fs_ipg; if (isset(inosused, ino)) { if ((jaddref->ja_mode & IFMT) == IFDIR) cgp->cg_cs.cs_ndir--; cgp->cg_cs.cs_nifree++; clrbit(inosused, ino); jaddref->ja_state &= ~ATTACHED; jaddref->ja_state |= UNDONE; stat_jaddref++; } else panic("initiate_write_bmsafemap: inode %ju " "marked free", (uintmax_t)jaddref->ja_ino); } } /* * Clear any block allocations which are pending journal writes. 
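 *
 * Note that jnewblk_rollback() above returns the number of fragments
 * it re-marked free, so a zero return means the bitmap never showed
 * the allocation at all; the loop below treats that as a fatal
 * inconsistency and panics.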
*/ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; blksfree = cg_blksfree(cgp); LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) continue; panic("initiate_write_bmsafemap: block %jd " "marked free", jnewblk->jn_blkno); } } /* * Move allocation lists to the written lists so they can be * cleared once the block write is complete. */ LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, inodedep, id_deps); LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, newblk, nb_deps); LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); } /* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. * */ static void softdep_disk_write_complete(bp) struct buf *bp; /* describes the completed disk write */ { struct worklist *wk; struct worklist *owk; struct ufsmount *ump; struct workhead reattach; struct freeblks *freeblks; struct buf *sbp; /* * If an error occurred while doing the write, then the data * has not hit the disk and the dependencies cannot be processed. * But we do have to go through and roll forward any dependencies * that were rolled back before the disk write. */ if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_PAGEDEP: handle_written_filepage(WK_PAGEDEP(wk), bp, 0); continue; case D_INODEDEP: handle_written_inodeblock(WK_INODEDEP(wk), bp, 0); continue; case D_BMSAFEMAP: handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, 0); continue; case D_INDIRDEP: handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, 0); continue; default: /* nothing to roll forward */ continue; } } return; } if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) return; ump = VFSTOUFS(wk->wk_mp); LIST_INIT(&reattach); /* * This lock must not be released anywhere in this code segment. 
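 *
 * In outline (an illustrative sketch only):
 *
 *	ACQUIRE_LOCK(ump);
 *	... dispatch each worklist item, collecting redo work ...
 *	FREE_LOCK(ump);
 *
 * with the reattach list spliced back onto bp->b_dep before the
 * lock is dropped, so no dependency is ever unlinked and unlocked.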
*/ sbp = NULL; owk = NULL; ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); atomic_add_long(&dep_write[wk->wk_type], 1); if (wk == owk) panic("duplicate worklist: %p\n", wk); owk = wk; switch (wk->wk_type) { case D_PAGEDEP: if (handle_written_filepage(WK_PAGEDEP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: if (handle_written_inodeblock(WK_INODEDEP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); continue; case D_ALLOCDIRECT: wk->wk_state |= COMPLETE; handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); continue; case D_ALLOCINDIR: wk->wk_state |= COMPLETE; handle_allocindir_partdone(WK_ALLOCINDIR(wk)); continue; case D_INDIRDEP: if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_FREEBLKS: wk->wk_state |= COMPLETE; freeblks = WK_FREEBLKS(wk); if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(wk, WK_NODELAY); continue; case D_FREEWORK: handle_written_freework(WK_FREEWORK(wk)); break; case D_JSEGDEP: free_jsegdep(WK_JSEGDEP(wk)); continue; case D_JSEG: handle_written_jseg(WK_JSEG(wk), bp); continue; case D_SBDEP: if (handle_written_sbdep(WK_SBDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_FREEDEP: free_freedep(WK_FREEDEP(wk)); continue; default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* * Reattach any requests that must be redone. */ while ((wk = LIST_FIRST(&reattach)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(ump); if (sbp) brelse(sbp); } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocdirect_partdone(adp, wkhd) struct allocdirect *adp; /* the completed allocdirect */ struct workhead *wkhd; /* Work to do when inode is written. */ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; long bsize; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. * We check the currently active set of allocdirects in id_inoupdt * or id_extupdt as appropriate. */ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; if (adp->ad_state & EXTDATA) listhead = &inodedep->id_extupdt; else listhead = &inodedep->id_inoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) { /* found our block */ if (listadp == adp) break; /* continue if ad_oldlbn is not a fragment */ if (listadp->ad_oldsize == 0 || listadp->ad_oldsize == bsize) continue; /* hit a fragment */ return; } /* * If we have reached the end of the current list without * finding the just finished dependency, then it must be * on the future dependency list. Future dependencies cannot * be freed until they are moved to the current list.
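 *
 * Here "current" means id_inoupdt or id_extupdt and "future" means
 * id_newinoupdt or id_newextupdt; merge_inode_lists(), later in
 * this file, is what moves entries from the future lists to the
 * current ones.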
*/ if (listadp == NULL) { #ifdef DEBUG if (adp->ad_state & EXTDATA) listhead = &inodedep->id_newextupdt; else listhead = &inodedep->id_newinoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) /* found our block */ if (listadp == adp) break; if (listadp == NULL) panic("handle_allocdirect_partdone: lost dep"); #endif /* DEBUG */ return; } /* * If we have found the just finished dependency, then queue * it along with anything that follows it that is complete. * Since the pointer has not yet been written in the inode * as the dependency prevents it, place the allocdirect on the * bufwait list where it will be freed once the pointer is * valid. */ if (wkhd == NULL) wkhd = &inodedep->id_bufwait; for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; TAILQ_REMOVE(listhead, adp, ad_next); WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); } } /* * Called from within softdep_disk_write_complete above. This routine * completes successfully written allocindirs. */ static void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { struct indirdep *indirdep; if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; indirdep = aip->ai_indirdep; LIST_REMOVE(aip, ai_next); /* * Don't set a pointer while the buffer is undergoing IO or while * we have active truncations. */ if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; /* * Await the pointer write before freeing the allocindir. */ LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); } /* * Release segments held on a jwork list. */ static void handle_jwork(wkhd) struct workhead *wkhd; { struct worklist *wk; while ((wk = LIST_FIRST(wkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_JSEGDEP: free_jsegdep(WK_JSEGDEP(wk)); continue; case D_FREEDEP: free_freedep(WK_FREEDEP(wk)); continue; case D_FREEFRAG: rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); WORKITEM_FREE(wk, D_FREEFRAG); continue; case D_FREEWORK: handle_written_freework(WK_FREEWORK(wk)); continue; default: panic("handle_jwork: Unknown type %s\n", TYPENAME(wk->wk_type)); } } } /* * Handle the bufwait list on an inode when it is safe to release items * held there. This normally happens after an inode block is written but * may be delayed and handled later if there are pending journal items that * are not yet safe to be released. */ static struct freefile * handle_bufwait(inodedep, refhd) struct inodedep *inodedep; struct workhead *refhd; { struct jaddref *jaddref; struct freefile *freefile; struct worklist *wk; freefile = NULL; while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_FREEFILE: /* * We defer adding freefile to the worklist * until all other additions have been made to * ensure that it will be done after all the * old blocks have been freed. 
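 *
 * The deferral amounts to (an illustrative sketch only):
 *
 *	freefile = WK_FREEFILE(wk);	saved, not queued
 *	...
 *	return (freefile);		the caller queues it last
 *
 * so the inode itself is not reclaimed before its old blocks have
 * been released.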
*/ if (freefile != NULL) panic("handle_bufwait: freefile"); freefile = WK_FREEFILE(wk); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); continue; case D_DIRADD: diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEFRAG: wk->wk_state |= COMPLETE; if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(wk, 0); continue; case D_DIRREM: wk->wk_state |= COMPLETE; add_to_worklist(wk, 0); continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: free_newblk(WK_NEWBLK(wk)); continue; case D_JNEWBLK: wk->wk_state |= COMPLETE; free_jnewblk(WK_JNEWBLK(wk)); continue; /* * Save freed journal segments and add references on * the supplied list which will delay their release * until the cg bitmap is cleared on disk. */ case D_JSEGDEP: if (refhd == NULL) free_jsegdep(WK_JSEGDEP(wk)); else WORKLIST_INSERT(refhd, wk); continue; case D_JADDREF: jaddref = WK_JADDREF(wk); TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); /* * Transfer any jaddrefs to the list to be freed with * the bitmap if we're handling a removed file. */ if (refhd == NULL) { wk->wk_state |= COMPLETE; free_jaddref(jaddref); } else WORKLIST_INSERT(refhd, wk); continue; default: panic("handle_bufwait: Unknown type %p(%s)", wk, TYPENAME(wk->wk_type)); /* NOTREACHED */ } } return (freefile); } /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further * interrupts from this device blocked. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_inodeblock(inodedep, bp, flags) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ int flags; { struct freefile *freefile; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; struct workhead wkhd; int hadchanges, fstype; ino_t freelink; LIST_INIT(&wkhd); hadchanges = 0; freefile = NULL; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); freelink = dp1->di_freelink; } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); freelink = dp2->di_freelink; } /* * Leave this inodeblock dirty until it's in the list. */ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED && (flags & WRITESUCCEEDED)) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); if ((inon == NULL && freelink == 0) || (inon && inon->id_ino == freelink)) { if (inon) inon->id_state |= UNLINKPREV; inodedep->id_state |= UNLINKNEXT; } hadchanges = 1; } /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until * all associated dependencies have been cleared and the * corresponding updates written to disk. 
*/ if (inodedep->id_savedino1 != NULL) { hadchanges = 1; if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else *dp2 = *inodedep->id_savedino2; free(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); /* * If the inode is clear here and GOINGAWAY it will never * be written. Process the bufwait and clear any pending * work which may include the freefile. */ if (inodedep->id_state & GOINGAWAY) goto bufwait; return (1); } if (flags & WRITESUCCEEDED) inodedep->id_state |= COMPLETE; /* * Roll forward anything that had to be rolled back before * the inode could be updated. */ for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { - if (adp->ad_offset < NDADDR) { + if (adp->ad_offset < UFS_NDADDR) { if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", (intmax_t)adp->ad_offset, dp1->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp1->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) + if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] != + 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_offset - NDADDR, - dp1->di_ib[adp->ad_offset - NDADDR]); - dp1->di_ib[adp->ad_offset - NDADDR] = + (intmax_t)adp->ad_offset - + UFS_NDADDR, + dp1->di_ib[adp->ad_offset - + UFS_NDADDR]); + dp1->di_ib[adp->ad_offset - UFS_NDADDR] = adp->ad_newblkno; } } else { - if (adp->ad_offset < NDADDR) { + if (adp->ad_offset < UFS_NDADDR) { if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", (intmax_t)adp->ad_offset, "mismatch", (intmax_t)dp2->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp2->di_db[adp->ad_offset] = adp->ad_newblkno; } else { - if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) + if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] != + 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", - (intmax_t)adp->ad_offset - NDADDR, + (intmax_t)adp->ad_offset - + UFS_NDADDR, (intmax_t) - dp2->di_ib[adp->ad_offset - NDADDR]); - dp2->di_ib[adp->ad_offset - NDADDR] = + dp2->di_ib[adp->ad_offset - + UFS_NDADDR]); + dp2->di_ib[adp->ad_offset - UFS_NDADDR] = adp->ad_newblkno; } } adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", (intmax_t)adp->ad_offset, "mismatch", (intmax_t)dp2->di_extb[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. 
*/ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); if (inodedep->id_savednlink > LINK_MAX) panic("handle_written_inodeblock: Invalid link count " "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink, inodedep); if (fstype == UFS1) { if (dp1->di_nlink != inodedep->id_savednlink) { dp1->di_nlink = inodedep->id_savednlink; hadchanges = 1; } if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { if (dp2->di_nlink != inodedep->id_savednlink) { dp2->di_nlink = inodedep->id_savednlink; hadchanges = 1; } if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; } if (dp2->di_extsize != inodedep->id_savedextsize) { dp2->di_extsize = inodedep->id_savedextsize; hadchanges = 1; } } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_savednlink = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (hadchanges) bdirty(bp); bufwait: /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) return (hadchanges); /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp, &wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) handle_allocdirect_partdone(adp, &wkhd); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples * before the old ones have been deleted. Completely * unlinked inodes are not processed until the unlinked * inode list is written or the last reference is removed. */ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { freefile = handle_bufwait(inodedep, NULL); if (freefile && !LIST_EMPTY(&wkhd)) { WORKLIST_INSERT(&wkhd, &freefile->fx_list); freefile = NULL; } } /* * Move rolled forward dependency completions to the bufwait list * now that those that were already written have been processed. */ if (!LIST_EMPTY(&wkhd) && hadchanges == 0) panic("handle_written_inodeblock: bufwait but no changes"); jwork_move(&inodedep->id_bufwait, &wkhd); if (freefile != NULL) { /* * If the inode is goingaway it was never written. Fake up * the state here so free_inodedep() can succeed. */ if (inodedep->id_state & GOINGAWAY) inodedep->id_state |= COMPLETE | DEPCOMPLETE; if (free_inodedep(inodedep) == 0) panic("handle_written_inodeblock: live inodedep %p", inodedep); add_to_worklist(&freefile->fx_list, 0); return (0); } /* * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && TAILQ_FIRST(&inodedep->id_extupdt) == 0 && LIST_FIRST(&inodedep->id_bufwait) == 0)) return (0); return (hadchanges); } /* * Perform needed roll-forwards and kick off any dependencies that * can now be processed. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. 
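 *
 * The handle_written_*() routines below share this convention on a
 * failed write (an illustrative sketch only):
 *
 *	if ((flags & WRITESUCCEEDED) == 0) {
 *		bdirty(bp);	keep the rolled-forward data dirty
 *		return (1);	stay attached so the rewrite is seen
 *	}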
*/ static int handle_written_indirdep(indirdep, bp, bpp, flags) struct indirdep *indirdep; struct buf *bp; struct buf **bpp; int flags; { struct allocindir *aip; struct buf *sbp; int chgs; if (indirdep->ir_state & GOINGAWAY) panic("handle_written_indirdep: indirdep gone"); if ((indirdep->ir_state & IOSTARTED) == 0) panic("handle_written_indirdep: IO not started"); chgs = 0; /* * If there were rollbacks revert them here. */ if (indirdep->ir_saveddata) { bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); if (TAILQ_EMPTY(&indirdep->ir_trunc)) { free(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = NULL; } chgs = 1; } indirdep->ir_state &= ~(UNDONE | IOSTARTED); indirdep->ir_state |= ATTACHED; /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) { stat_indir_blk_ptrs++; bdirty(bp); return (1); } /* * Move allocindirs with written pointers to the completehd if * the indirdep's pointer is not yet written. Otherwise * free them here. */ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) { LIST_REMOVE(aip, ai_next); if ((indirdep->ir_state & DEPCOMPLETE) == 0) { LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, ai_next); newblk_freefrag(&aip->ai_block); continue; } free_newblk(&aip->ai_block); } /* * Move allocindirs that have finished dependency processing from * the done list to the write list after updating the pointers. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) { while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) { handle_allocindir_partdone(aip); if (aip == LIST_FIRST(&indirdep->ir_donehd)) panic("disk_write_complete: not gone"); chgs = 1; } } /* * Preserve the indirdep if there were any changes or if it is not * yet valid on disk. */ if (chgs) { stat_indir_blk_ptrs++; bdirty(bp); return (1); } /* * If there were no changes we can discard the savedbp and detach * ourselves from the buf. We are only carrying completed pointers * in this case. */ sbp = indirdep->ir_savebp; sbp->b_flags |= B_INVAL | B_NOCACHE; indirdep->ir_savebp = NULL; indirdep->ir_bp = NULL; if (*bpp != NULL) panic("handle_written_indirdep: bp already exists."); *bpp = sbp; /* * The indirdep may not be freed until its parent points at it. */ if (indirdep->ir_state & DEPCOMPLETE) free_indirdep(indirdep); return (0); } /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. */ static void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { dap->da_state |= COMPLETE; complete_diradd(dap); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* * Returns true if the bmsafemap will have rollbacks when written. Must only * be called with the per-filesystem lock and the buf lock on the cg held. */ static int bmsafemap_backgroundwrite(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; { int dirty; LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp)); dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | !LIST_EMPTY(&bmsafemap->sm_jnewblkhd); /* * If we're initiating a background write we need to process the * rollbacks as they exist now, not as they exist when IO starts. * No other consumers will look at the contents of the shadowed * buf so this is safe to do here. */ if (bp->b_xflags & BX_BKGRDMARKER) initiate_write_bmsafemap(bmsafemap, bp); return (dirty); } /* * Re-apply an allocation when a cg write is complete. 
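 *
 * This is the inverse of jnewblk_rollback() above: the same fragment
 * range [jn_oldfrags, jn_frags) is walked, but the bits are cleared
 * in blksfree rather than set and the cg free counts move in the
 * opposite direction.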
*/ static int jnewblk_rollforward(jnewblk, fs, cgp, blksfree) struct jnewblk *jnewblk; struct fs *fs; struct cg *cgp; uint8_t *blksfree; { ufs1_daddr_t fragno; ufs2_daddr_t blkno; long cgbno, bbase; int frags, blk; int i; frags = 0; cgbno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isclr(blksfree, cgbno + i)) panic("jnewblk_rollforward: re-allocated fragment"); frags++; } if (frags == fs->fs_frag) { blkno = fragstoblks(fs, cgbno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; } else { bbase = cgbno - fragnum(fs, cgbno); cgbno += jnewblk->jn_oldfrags; /* If a complete block had been reassembled, account for it. */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree += fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, -1); cgp->cg_cs.cs_nbfree--; } /* Decrement the old frags. */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* Allocate the fragment */ for (i = 0; i < frags; i++) clrbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree -= frags; /* Add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); } return (frags); } /* * Complete a write to a bmsafemap structure. Roll forward any bitmap * changes if it's not a background write. Set all written dependencies * to DEPCOMPLETE and free the structure if possible. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_bmsafemap(bmsafemap, bp, flags) struct bmsafemap *bmsafemap; struct buf *bp; int flags; { struct newblk *newblk; struct inodedep *inodedep; struct jaddref *jaddref, *jatmp; struct jnewblk *jnewblk, *jntmp; struct ufsmount *ump; uint8_t *inosused; uint8_t *blksfree; struct cg *cgp; struct fs *fs; ino_t ino; int foreground; int chgs; if ((bmsafemap->sm_state & IOSTARTED) == 0) panic("handle_written_bmsafemap: Not started\n"); ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); chgs = 0; bmsafemap->sm_state &= ~IOSTARTED; foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0; /* * If write was successful, release journal work that was waiting * on the write. Otherwise move the work back. */ if (flags & WRITESUCCEEDED) handle_jwork(&bmsafemap->sm_freewr); else LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); /* * Restore unwritten inode allocation pending jaddref writes. */ if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; inosused = cg_inosused(cgp); LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps, jatmp) { if ((jaddref->ja_state & UNDONE) == 0) continue; ino = jaddref->ja_ino % fs->fs_ipg; if (isset(inosused, ino)) panic("handle_written_bmsafemap: " "re-allocated inode"); /* Do the roll-forward only if it's a real copy. */ if (foreground) { if ((jaddref->ja_mode & IFMT) == IFDIR) cgp->cg_cs.cs_ndir++; cgp->cg_cs.cs_nifree--; setbit(inosused, ino); chgs = 1; } jaddref->ja_state &= ~UNDONE; jaddref->ja_state |= ATTACHED; free_jaddref(jaddref); } } /* * Restore any block allocations which are pending journal writes. 
*/ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; blksfree = cg_blksfree(cgp); LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, jntmp) { if ((jnewblk->jn_state & UNDONE) == 0) continue; /* Do the roll-forward only if it's a real copy. */ if (foreground && jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) chgs = 1; jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); jnewblk->jn_state |= ATTACHED; free_jnewblk(jnewblk); } } /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) { LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, newblk, nb_deps); LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); if (foreground) bdirty(bp); return (1); } while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_state &= ~ONDEPLIST; newblk->nb_bmsafemap = NULL; LIST_REMOVE(newblk, nb_deps); if (newblk->nb_list.wk_type == D_ALLOCDIRECT) handle_allocdirect_partdone( WK_ALLOCDIRECT(&newblk->nb_list), NULL); else if (newblk->nb_list.wk_type == D_ALLOCINDIR) handle_allocindir_partdone( WK_ALLOCINDIR(&newblk->nb_list)); else if (newblk->nb_list.wk_type != D_NEWBLK) panic("handle_written_bmsafemap: Unexpected type: %s", TYPENAME(newblk->nb_list.wk_type)); } while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { inodedep->id_state |= DEPCOMPLETE; inodedep->id_state &= ~ONDEPLIST; LIST_REMOVE(inodedep, id_deps); inodedep->id_bmsafemap = NULL; } LIST_REMOVE(bmsafemap, sm_next); if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && LIST_EMPTY(&bmsafemap->sm_newblkhd) && LIST_EMPTY(&bmsafemap->sm_inodedephd) && LIST_EMPTY(&bmsafemap->sm_freehd)) { LIST_REMOVE(bmsafemap, sm_hash); WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); return (0); } LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); if (foreground) bdirty(bp); return (1); } /* * Try to free a mkdir dependency. */ static void complete_mkdir(mkdir) struct mkdir *mkdir; { struct diradd *dap; if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) return; LIST_REMOVE(mkdir, md_mkdirs); dap = mkdir->md_diradd; dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { dap->da_state |= DEPCOMPLETE; complete_diradd(dap); } WORKITEM_FREE(mkdir, D_MKDIR); } /* * Handle the completion of a mkdir dependency. */ static void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; { if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) panic("handle_written_mkdir: bad type"); mkdir->md_state |= COMPLETE; complete_mkdir(mkdir); } static int free_pagedep(pagedep) struct pagedep *pagedep; { int i; if (pagedep->pd_state & NEWBLOCK) return (0); if (!LIST_EMPTY(&pagedep->pd_dirremhd)) return (0); for (i = 0; i < DAHASHSZ; i++) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) return (0); if (!LIST_EMPTY(&pagedep->pd_pendinghd)) return (0); if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) return (0); if (pagedep->pd_state & ONWORKLIST) WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); return (1); } /* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. 
* Note that this routine is always called from interrupt level * with further interrupts from this device blocked. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_filepage(pagedep, bp, flags) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ int flags; { struct dirrem *dirrem; struct diradd *dap, *nextdap; struct direct *ep; int i, chgs; if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; if ((flags & WRITESUCCEEDED) == 0) goto rollforward; /* * Process any directory removals that have been committed. */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_state |= COMPLETE; dirrem->dm_dirinum = pagedep->pd_ino; KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), ("handle_written_filepage: Journal entries not written.")); add_to_worklist(&dirrem->dm_list, 0); } /* * Free any directory additions that have been committed. * If it is a newly allocated block, we have to wait until * the on-disk directory inode claims the new block. */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap, NULL); rollforward: /* * Uncommitted directory entries must be restored. */ for (chgs = 0, i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = nextdap) { nextdap = LIST_NEXT(dap, da_pdlist); if (dap->da_state & ATTACHED) panic("handle_written_filepage: attached"); ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); ep->d_ino = dap->da_newinum; dap->da_state &= ~UNDONE; dap->da_state |= ATTACHED; chgs = 1; /* * If the inode referenced by the directory has * been written out, then the dependency can be * moved to the pending list. */ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } } /* * If there were any rollbacks in the directory, then it must be * marked dirty so that it will eventually get written back in * its correct form. */ if (chgs || (flags & WRITESUCCEEDED) == 0) { if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; bdirty(bp); return (1); } /* * If we are not waiting for a new directory block to be * claimed by its inode, then the pagedep will be freed. * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. */ free_pagedep(pagedep); return (0); } /* * Writing back in-core inode structures. * * The filesystem only accesses an inode's contents when it occupies an * "in-core" inode structure. These "in-core" structures are separate from * the page frames used to cache inode blocks. Only the latter are * transferred to/from the disk. So, when the updated contents of the * "in-core" inode structure are copied to the corresponding in-memory inode * block, the dependencies are also transferred. The following procedure is * called when copying a dirty "in-core" inode to a cached inode block. */ /* * Called when an inode is loaded from disk. If the effective link count * differed from the actual link count when it was last flushed, then we * need to ensure that the correct effective link count is put back.
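 *
 * In outline (an illustrative sketch, not verbatim code):
 *
 *	ip->i_effnlink = ip->i_nlink;
 *	if an inodedep is found:
 *		ip->i_effnlink -= inodedep->id_nlinkdelta;
 *
 * so removals that have not yet reached the disk are reflected in
 * the effective count.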
*/ void softdep_load_inodeblock(ip) struct inode *ip; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_load_inodeblock called on non-softdep filesystem")); /* * Check for alternate nlink count. */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(ump); if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); return; } ip->i_effnlink -= inodedep->id_nlinkdelta; FREE_LOCK(ump); } /* * This routine is called just before the "in-core" inode * information is to be copied to the in-memory inode block. * Recall that an inode block contains several inodes. If * the force flag is set, then the dependencies will be * cleared so that the update can always be made. Note that * the buffer is locked when this routine is called, so we * will never be in the middle of writing the inode block * to disk. */ void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; /* the "in_core" copy of the inode */ struct buf *bp; /* the buffer containing the inode block */ int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; struct worklist *wk; struct mount *mp; struct buf *ibp; struct fs *fs; int error; ump = ITOUMP(ip); mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_update_inodeblock called on non-softdep filesystem")); fs = ump->um_fs; /* * Preserve the freelink that is on disk. clear_unlinked_inodedep() * does not have access to the in-core ip so must write directly into * the inode block buffer when setting freelink. */ if (fs->fs_magic == FS_UFS1_MAGIC) DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))->di_freelink); else DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))->di_freelink); /* * If the effective link count is not equal to the actual link * count, then we must track the difference in an inodedep while * the inode is (potentially) tossed out of the cache. Otherwise, * if there is no existing inodedep, then there are no dependencies * to track. */ ACQUIRE_LOCK(ump); again: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); if (ip->i_effnlink != ip->i_nlink) panic("softdep_update_inodeblock: bad link count"); return; } if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* * If we're flushing all dependencies we must also move any waiting * for journal writes onto the bufwait list prior to I/O. */ if (waitfor) { TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto again; } } } /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ inodedep->id_state &= ~COMPLETE; if ((inodedep->id_state & ONWORKLIST) == 0) WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); /* * Any new dependencies associated with the incore inode must * now be moved to the list associated with the buffer holding * the in-memory copy of the inode. Once merged process any * allocdirects that are completed by the merger. 
*/ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), NULL); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), NULL); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk * can be moved to the id_bufwait so that they will be * processed when the buffer I/O completes. */ while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&inodedep->id_bufwait, wk); } /* * Newly allocated inodes cannot be written until the bitmap * that allocates them has been written (indicated by * DEPCOMPLETE being set in id_state). If we are doing a * forced sync (e.g., an fsync on a file), we force the bitmap * to be written so that the update can be done. */ if (waitfor == 0) { FREE_LOCK(ump); return; } retry: if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { FREE_LOCK(ump); return; } ibp = inodedep->id_bmsafemap->sm_buf; ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT); if (ibp == NULL) { /* * If ibp came back as NULL, the dependency could have been * freed while we slept. Look it up again, and check to see * that it has completed. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) goto retry; FREE_LOCK(ump); return; } FREE_LOCK(ump); if ((error = bwrite(ibp)) != 0) softdep_error("softdep_update_inodeblock: bwrite", error); } /* * Merge a new inode dependency list (such as id_newinoupdt) into an * old inode dependency list (such as id_inoupdt). This routine must be * called with splbio interrupts blocked. */ static void merge_inode_lists(newlisthead, oldlisthead) struct allocdirectlst *newlisthead; struct allocdirectlst *oldlisthead; { struct allocdirect *listadp, *newadp; newadp = TAILQ_FIRST(newlisthead); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { if (listadp->ad_offset < newadp->ad_offset) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); if (listadp->ad_offset == newadp->ad_offset) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; } newadp = TAILQ_FIRST(newlisthead); } while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); } } /* * If we are doing an fsync, then we must ensure that any directory * entries for the inode have been written after the inode gets to disk.
*/ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct pagedep *pagedep; struct inoref *inoref; struct ufsmount *ump; struct worklist *wk; struct diradd *dap; struct mount *mp; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct thread *td = curthread; int error, flushparent, pagedep_new_block; ino_t parentino; ufs_lbn_t lbn; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); fs = ump->um_fs; if (MOUNTEDSOFTDEP(mp) == 0) return (0); ACQUIRE_LOCK(ump); restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); return (0); } TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (!LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) panic("softdep_fsync: pending ops %p", inodedep); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* * Flush our parent if this directory entry has a MKDIR_PARENT * dependency or is contained in a newly allocated block. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; parentino = pagedep->pd_ino; lbn = pagedep->pd_lbn; if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) panic("softdep_fsync: dirty"); if ((dap->da_state & MKDIR_PARENT) || (pagedep->pd_state & NEWBLOCK)) flushparent = 1; else flushparent = 0; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the * vnode below, so we just have to give up on writing its * directory entry out. It will eventually be written, just * not now, but then the user was not asking to have it * written, so we are not breaking any promises. */ if (vp->v_iflag & VI_DOOMED) break; /* * We prevent deadlock by always fetching inodes from the * root, moving down the directory tree. Thus, when fetching * our parent directory, we first try to get the lock. If * that fails, we must unlock ourselves before requesting * the lock on our parent. See the comment in ufs_lookup * for details on possible races. */ FREE_LOCK(ump); if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ)) { error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (vp->v_iflag & VI_DOOMED) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ); vfs_unbusy(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { if (error == 0) vput(pvp); error = ENOENT; } if (error != 0) return (error); } /* * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps * that are contained in direct blocks will be resolved by * doing a ffs_update. Pagedeps contained in indirect blocks * may require a complete sync'ing of the directory. So, we * try the cheap and fast ffs_update first, and if that fails, * then we do the slower ffs_syncvnode of the directory. 
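 *
 * The lock-order dance above (try-lock the parent; on failure, unlock
 * the child, take the parent, then retake the child and revalidate) is
 * a general deadlock-avoidance pattern.  A minimal sketch with POSIX
 * threads, hypothetical names, called with the child lock already
 * held; the caller must recheck the child's state whenever this
 * returns 1:
 *
 *	#include <pthread.h>
 *
 *	static int
 *	lock_parent_of(pthread_mutex_t *child, pthread_mutex_t *parent)
 *	{
 *		if (pthread_mutex_trylock(parent) == 0)
 *			return (0);
 *		(void)pthread_mutex_unlock(child);
 *		(void)pthread_mutex_lock(parent);
 *		(void)pthread_mutex_lock(child);
 *		return (1);
 *	}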
*/ if (flushparent) { int locked; if ((error = ffs_update(pvp, 1)) != 0) { vput(pvp); return (error); } ACQUIRE_LOCK(ump); locked = 1; if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; pagedep_new_block = pagedep->pd_state & NEWBLOCK; FREE_LOCK(ump); locked = 0; if (pagedep_new_block && (error = ffs_syncvnode(pvp, MNT_WAIT, 0))) { vput(pvp); return (error); } } } if (locked) FREE_LOCK(ump); } /* * Flush directory page containing the inode's name. */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, &bp); if (error == 0) error = bwrite(bp); else brelse(bp); vput(pvp); if (error != 0) return (error); ACQUIRE_LOCK(ump); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) break; } FREE_LOCK(ump); return (0); } /* * Flush all the dirty bitmaps associated with the block device * before flushing the rest of the dirty blocks so as to reduce * the number of dependencies that will have to be rolled back. * * XXX Unused? */ void softdep_fsync_mountdev(vp) struct vnode *vp; { struct buf *bp, *nbp; struct worklist *wk; struct bufobj *bo; if (!vn_isdisk(vp, NULL)) panic("softdep_fsync_mountdev: vnode not a disk"); bo = &vp->v_bufobj; restart: BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { /* * If it is already scheduled, skip to the next buffer. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* * We are only interested in bitmaps with outstanding * dependencies. */ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || wk->wk_type != D_BMSAFEMAP || (bp->b_vflags & BV_BKGRDINPROG)) { BUF_UNLOCK(bp); continue; } BO_UNLOCK(bo); bremfree(bp); (void) bawrite(bp); goto restart; } drain_output(vp); BO_UNLOCK(bo); } /* * Sync all cylinder groups that were dirty at the time this function is * called. Newly dirtied cgs will be inserted before the sentinel. This * is used to flush freedep activity that may be holding up writes to an * indirect block. */ static int sync_cgs(mp, waitfor) struct mount *mp; int waitfor; { struct bmsafemap *bmsafemap; struct bmsafemap *sentinel; struct ufsmount *ump; struct buf *bp; int error; sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK); sentinel->sm_cg = -1; ump = VFSTOUFS(mp); error = 0; ACQUIRE_LOCK(ump); LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next); for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL; bmsafemap = LIST_NEXT(sentinel, sm_next)) { /* Skip sentinels and cgs with no work to release. */ if (bmsafemap->sm_cg == -1 || (LIST_EMPTY(&bmsafemap->sm_freehd) && LIST_EMPTY(&bmsafemap->sm_freewr))) { LIST_REMOVE(sentinel, sm_next); LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); continue; } /* * If we don't get the lock and we're waiting, try again; if * not, move on to the next buf and try to sync it.
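 *
 * The sentinel technique used by this loop generalizes: to iterate a
 * list that may change while the lock is dropped, keep a dummy node in
 * the list and advance it past each element before that element is
 * handled.  A userland sketch with hypothetical names:
 *
 *	#include <sys/queue.h>
 *	#include <stddef.h>
 *
 *	struct node { int is_sentinel; LIST_ENTRY(node) next; };
 *	LIST_HEAD(nlist, node);
 *
 *	static void
 *	walk_with_sentinel(struct nlist *head, struct node *sentinel,
 *	    void (*process)(struct node *))
 *	{
 *		struct node *n;
 *
 *		LIST_INSERT_HEAD(head, sentinel, next);
 *		while ((n = LIST_NEXT(sentinel, next)) != NULL) {
 *			LIST_REMOVE(sentinel, next);
 *			LIST_INSERT_AFTER(n, sentinel, next);
 *			if (!n->is_sentinel)
 *				process(n);
 *		}
 *		LIST_REMOVE(sentinel, next);
 *	}
 *
 * Because the sentinel moves forward before each element is processed,
 * process() may drop and retake the list lock without losing its place.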
*/ bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor); if (bp == NULL && waitfor == MNT_WAIT) continue; LIST_REMOVE(sentinel, sm_next); LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); if (bp == NULL) continue; FREE_LOCK(ump); if (waitfor == MNT_NOWAIT) bawrite(bp); else error = bwrite(bp); ACQUIRE_LOCK(ump); if (error) break; } LIST_REMOVE(sentinel, sm_next); FREE_LOCK(ump); free(sentinel, M_BMSAFEMAP); return (error); } /* * This routine is called when we are trying to synchronously flush a * file. This routine must eliminate any filesystem metadata dependencies * so that the syncing routine can succeed. */ int softdep_sync_metadata(struct vnode *vp) { struct inode *ip; int error; ip = VTOI(vp); KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_sync_metadata called on non-softdep filesystem")); /* * Ensure that any direct block dependencies have been cleared, * truncations are started, and inode references are journaled. */ ACQUIRE_LOCK(VFSTOUFS(vp->v_mount)); /* * Write all journal records to prevent rollbacks on devvp. */ if (vp->v_type == VCHR) softdep_flushjournal(vp->v_mount); error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number); /* * Ensure that all truncates are written so we won't find deps on * indirect blocks. */ process_truncates(vp); FREE_LOCK(VFSTOUFS(vp->v_mount)); return (error); } /* * This routine is called when we are attempting to sync a buf with * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any * other IO it can but returns EBUSY if the buffer is not yet able to * be written. Dependencies which will not cause rollbacks will always * return 0. */ int softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) { struct indirdep *indirdep; struct pagedep *pagedep; struct allocindir *aip; struct newblk *newblk; struct ufsmount *ump; struct buf *nbp; struct worklist *wk; int i, error; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_sync_buf called on non-softdep filesystem")); /* * For VCHR we just don't want to force flush any dependencies that * will cause rollbacks. */ if (vp->v_type == VCHR) { if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) return (EBUSY); return (0); } ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); /* * As we hold the buffer locked, none of its dependencies * will disappear. 
*/ error = 0; top: LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_ALLOCDIRECT: case D_ALLOCINDIR: newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk != NULL) { if (waitfor == MNT_NOWAIT) { error = EBUSY; goto out_unlock; } jwait(&newblk->nb_jnewblk->jn_list, waitfor); goto top; } if (newblk->nb_state & DEPCOMPLETE || waitfor == MNT_NOWAIT) continue; nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); if (nbp == NULL) goto top; FREE_LOCK(ump); if ((error = bwrite(nbp)) != 0) goto out; ACQUIRE_LOCK(ump); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (waitfor == MNT_NOWAIT) { if (!TAILQ_EMPTY(&indirdep->ir_trunc) || !LIST_EMPTY(&indirdep->ir_deplisthd)) { error = EBUSY; goto out_unlock; } } if (!TAILQ_EMPTY(&indirdep->ir_trunc)) panic("softdep_sync_buf: truncation pending."); restart: LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { newblk = (struct newblk *)aip; if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, waitfor); goto restart; } if (newblk->nb_state & DEPCOMPLETE) continue; nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); if (nbp == NULL) goto restart; FREE_LOCK(ump); if ((error = bwrite(nbp)) != 0) goto out; ACQUIRE_LOCK(ump); goto restart; } continue; case D_PAGEDEP: /* * Only flush directory entries in synchronous passes. */ if (waitfor != MNT_WAIT) { error = EBUSY; goto out_unlock; } /* * While syncing snapshots, we must allow recursive * lookups. */ BUF_AREC(bp); /* * We are trying to sync a directory that may * have dependencies on both its own metadata * and/or dependencies on the inodes of any * recently allocated files. We walk its diradd * lists pushing out the associated inode. */ pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; if ((error = flush_pagedep_deps(vp, wk->wk_mp, &pagedep->pd_diraddhd[i]))) { BUF_NOREC(bp); goto out_unlock; } } BUF_NOREC(bp); continue; case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: case D_JNEWBLK: continue; default: panic("softdep_sync_buf: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out_unlock: FREE_LOCK(ump); out: return (error); } /* * Flush the dependencies associated with an inodedep. * Called with splbio blocked. */ static int flush_inodedep_deps(vp, mp, ino) struct vnode *vp; struct mount *mp; ino_t ino; { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; int error, waitfor; /* * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. * We give a brief window at the top of the loop to allow * any pending I/O to complete. 
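 *
 * Reduced to its skeleton, and with a hypothetical start_write()
 * standing in for the buffer I/O, the two-pass scheme reads roughly:
 *
 *	enum wmode { W_ASYNC, W_SYNC };
 *
 *	static void start_write(int bufno, enum wmode mode);
 *
 *	static void
 *	flush_two_pass(const int *bufs, int n)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++)
 *			start_write(bufs[i], W_ASYNC);
 *		for (i = 0; i < n; i++)
 *			start_write(bufs[i], W_SYNC);
 *	}
 *
 * Pass one starts the writes without sleeping; pass two blocks on each
 * buffer, which in the common case is already on its way to disk.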
*/ ump = VFSTOUFS(mp); LOCK_OWNED(ump); for (error = 0, waitfor = MNT_NOWAIT; ; ) { if (error) return (error); FREE_LOCK(ump); ACQUIRE_LOCK(ump); restart: if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) continue; /* * If this was pass 2, we are done; otherwise switch to pass 2. */ if (waitfor == MNT_WAIT) break; waitfor = MNT_WAIT; } /* * Try freeing inodedep in case all dependencies have been removed. */ if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) (void) free_inodedep(inodedep); return (0); } /* * Flush an inode dependency list. * Called with splbio blocked. */ static int flush_deplist(listhead, waitfor, errorp) struct allocdirectlst *listhead; int waitfor; int *errorp; { struct allocdirect *adp; struct newblk *newblk; struct ufsmount *ump; struct buf *bp; if ((adp = TAILQ_FIRST(listhead)) == NULL) return (0); ump = VFSTOUFS(adp->ad_list.wk_mp); LOCK_OWNED(ump); TAILQ_FOREACH(adp, listhead, ad_next) { newblk = (struct newblk *)adp; if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); return (1); } if (newblk->nb_state & DEPCOMPLETE) continue; bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) continue; return (1); } FREE_LOCK(ump); if (waitfor == MNT_NOWAIT) bawrite(bp); else *errorp = bwrite(bp); ACQUIRE_LOCK(ump); return (1); } return (0); } /* * Flush dependencies associated with an allocdirect block. */ static int flush_newblk_dep(vp, mp, lbn) struct vnode *vp; struct mount *mp; ufs_lbn_t lbn; { struct newblk *newblk; struct ufsmount *ump; struct bufobj *bo; struct inode *ip; struct buf *bp; ufs2_daddr_t blkno; int error; error = 0; bo = &vp->v_bufobj; ip = VTOI(vp); blkno = DIP(ip, i_db[lbn]); if (blkno == 0) panic("flush_newblk_dep: Missing block"); ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); /* * Loop until all dependencies related to this block are satisfied. * We must be careful to restart after each sleep in case a write * completes some part of this process for us. */ for (;;) { if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { FREE_LOCK(ump); break; } if (newblk->nb_list.wk_type != D_ALLOCDIRECT) panic("flush_newblk_deps: Bad newblk %p", newblk); /* * Flush the journal. */ if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); continue; } /* * Write the bitmap dependency. */ if ((newblk->nb_state & DEPCOMPLETE) == 0) { bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); if (bp == NULL) continue; FREE_LOCK(ump); error = bwrite(bp); if (error) break; ACQUIRE_LOCK(ump); continue; } /* * Write the buffer. */ FREE_LOCK(ump); BO_LOCK(bo); bp = gbincore(bo, lbn); if (bp != NULL) { error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)); if (error == ENOLCK) { ACQUIRE_LOCK(ump); error = 0; continue; /* Slept, retry */ } if (error != 0) break; /* Failed */ if (bp->b_flags & B_DELWRI) { bremfree(bp); error = bwrite(bp); if (error) break; } else BUF_UNLOCK(bp); } else BO_UNLOCK(bo); /* * We have to wait for the direct pointers to * point at the newdirblk before the dependency * will go away.
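 *
 * The enclosing for (;;) loop is a restart-after-sleep pattern: after
 * any step that may sleep, the loop starts over from the lookup rather
 * than trusting stale state.  A schematic sketch with hypothetical
 * helpers:
 *
 *	struct dep;
 *	int  lookup_dependency(struct dep **);
 *	void satisfy_one_precondition(struct dep *);
 *
 *	static void
 *	flush_until_stable(void)
 *	{
 *		struct dep *dep;
 *
 *		for (;;) {
 *			if (!lookup_dependency(&dep))
 *				break;
 *			satisfy_one_precondition(dep);
 *		}
 *	}
 *
 * Each iteration re-derives the remaining work from scratch, so a
 * write completed by another thread while we slept simply makes the
 * next lookup fail and the loop terminate.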
*/ error = ffs_update(vp, 1); if (error) break; ACQUIRE_LOCK(ump); } return (error); } /* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ static int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; int error = 0; struct buf *bp; ino_t inum; struct diraddhd unfinished; LIST_INIT(&unfinished); ump = VFSTOUFS(mp); LOCK_OWNED(ump); restart: while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry * has a MKDIR_PARENT dependency. */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(ump); if ((error = ffs_update(pvp, 1)) != 0) break; ACQUIRE_LOCK(ump); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; /* * All MKDIR_PARENT dependencies and all the * NEWBLOCK pagedeps that are contained in direct * blocks were resolved by doing above ffs_update. * Pagedeps contained in indirect blocks may * require a complete sync'ing of the directory. * We are in the midst of doing a complete sync, * so if they are not resolved in this pass we * defer them for now as they will be sync'ed by * our caller shortly. */ LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&unfinished, dap, da_pdlist); continue; } /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be * committed in its parent. */ inum = dap->da_newinum; if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode1"); /* * Wait for any pending journal adds to complete so we don't * cause rollbacks while syncing. */ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(ump); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; error = flush_newblk_dep(vp, mp, 0); /* * If we still have the dependency we might need to * update the vnode to sync the new link count to * disk. */ if (error == 0 && dap == LIST_FIRST(diraddhdp)) error = ffs_update(vp, 1); vput(vp); if (error != 0) break; ACQUIRE_LOCK(ump); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_BODY) { inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); panic("flush_pagedep_deps: MKDIR_BODY " "inodedep %p dap %p vp %p", inodedep, dap, vp); } } /* * Flush the inode on which the directory entry depends. * Having accounted for MKDIR_PARENT and MKDIR_BODY above, * the only remaining dependency is that the updated inode * count must get pushed to disk. The inode has already * been pushed into its inode buffer (via VOP_UPDATE) at * the time of the reference count change. So we need only * locate that buffer, ensure that there will be no rollback * caused by a bitmap dependency, then write the inode buffer. */ retry: if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode"); /* * If the inode still has bitmap dependencies, * push them to disk. 
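 *
 * The "unfinished" list used by this routine is a common move-aside
 * idiom: entries that cannot be completed in this pass are unlinked
 * and parked on a private list, then spliced back once the scan is
 * done.  A userland sketch (can_finish() and finish() hypothetical):
 *
 *	#include <sys/queue.h>
 *	#include <stddef.h>
 *
 *	struct job { LIST_ENTRY(job) link; };
 *	LIST_HEAD(jlist, job);
 *
 *	int  can_finish(struct job *);
 *	void finish(struct job *);
 *
 *	static void
 *	run_jobs(struct jlist *work)
 *	{
 *		struct jlist parked = LIST_HEAD_INITIALIZER(parked);
 *		struct job *j;
 *
 *		while ((j = LIST_FIRST(work)) != NULL) {
 *			LIST_REMOVE(j, link);
 *			if (can_finish(j))
 *				finish(j);
 *			else
 *				LIST_INSERT_HEAD(&parked, j, link);
 *		}
 *		while ((j = LIST_FIRST(&parked)) != NULL) {
 *			LIST_REMOVE(j, link);
 *			LIST_INSERT_HEAD(work, j, link);
 *		}
 *	}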
*/ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { bp = inodedep->id_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); if (bp == NULL) goto retry; FREE_LOCK(ump); if ((error = bwrite(bp)) != 0) break; ACQUIRE_LOCK(ump); if (dap != LIST_FIRST(diraddhdp)) continue; } /* * If the inode is still sitting in a buffer waiting * to be written or waiting for the link count to be * adjusted update it here to flush it to disk. */ if (dap == LIST_FIRST(diraddhdp)) { FREE_LOCK(ump); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; error = ffs_update(vp, 1); vput(vp); if (error) break; ACQUIRE_LOCK(ump); } /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == LIST_FIRST(diraddhdp)) { inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); panic("flush_pagedep_deps: failed to flush " "inodedep %p ino %ju dap %p", inodedep, (uintmax_t)inum, dap); } } if (error) ACQUIRE_LOCK(ump); while ((dap = LIST_FIRST(&unfinished)) != NULL) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist); } return (error); } /* * A large burst of file addition or deletion activity can drive the * memory load excessively high. First attempt to slow things down * using the techniques below. If that fails, this routine requests * the offending operations to fall back to running synchronously * until the memory load returns to a reasonable level. */ int softdep_slowdown(vp) struct vnode *vp; { struct ufsmount *ump; int jlow; int max_softdeps_hard; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_slowdown called on non-softdep filesystem")); ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); jlow = 0; /* * Check for journal space if needed. */ if (DOINGSUJ(vp)) { if (journal_space(ump, 0) == 0) jlow = 1; } /* * If the system is under its limits and our filesystem is * not responsible for more than our share of the usage and * we are not low on journal space, then no need to slow down. */ max_softdeps_hard = max_softdeps * 11 / 10; if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && dep_current[D_INODEDEP] < max_softdeps_hard && dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 && dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 && ump->softdep_curdeps[D_DIRREM] < (max_softdeps_hard / 2) / stat_flush_threads && ump->softdep_curdeps[D_INODEDEP] < max_softdeps_hard / stat_flush_threads && ump->softdep_curdeps[D_INDIRDEP] < (max_softdeps_hard / 1000) / stat_flush_threads && ump->softdep_curdeps[D_FREEBLKS] < max_softdeps_hard / stat_flush_threads) { FREE_LOCK(ump); return (0); } /* * If the journal is low or our filesystem is over its limit * then speedup the cleanup. */ if (ump->softdep_curdeps[D_INDIRDEP] < (max_softdeps_hard / 1000) / stat_flush_threads || jlow) softdep_speedup(ump); stat_sync_limit_hit += 1; FREE_LOCK(ump); /* * We only slow down the rate at which new dependencies are * generated if we are not using journaling. With journaling, * the cleanup should always be sufficient to keep things * under control. */ if (DOINGSUJ(vp)) return (0); return (1); } /* * Called by the allocation routines when they are about to fail * in the hope that we can free up the requested resource (inodes * or disk space). * * First check to see if the work list has anything on it. If it has, * clean up entries until we successfully free the requested resource. * Because this process holds inodes locked, we cannot handle any remove * requests that might block on a locked inode as that could lead to * deadlock. 
If the worklist yields none of the requested resource, * start syncing out vnodes to free up the needed space. */ int softdep_request_cleanup(fs, vp, cred, resource) struct fs *fs; struct vnode *vp; struct ucred *cred; int resource; { struct ufsmount *ump; struct mount *mp; struct vnode *lvp, *mvp; long starttime; ufs2_daddr_t needed; int error; /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to process any * worklist items as we will recurse into the copyonwrite * routine. This will result in an incoherent snapshot. * If the vnode that we hold is a snapshot, we must avoid * handling other resources that could cause deadlock. */ if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp))) return (0); if (resource == FLUSH_BLOCKS_WAIT) stat_cleanup_blkrequests += 1; else stat_cleanup_inorequests += 1; mp = vp->v_mount; ump = VFSTOUFS(mp); mtx_assert(UFS_MTX(ump), MA_OWNED); UFS_UNLOCK(ump); error = ffs_update(vp, 1); if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) { UFS_LOCK(ump); return (0); } /* * If we are in need of resources, start by cleaning up * any block removals associated with our inode. */ ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); FREE_LOCK(ump); /* * Now clean up at least as many resources as we will need. * * When requested to clean up inodes, the number that are needed * is set by the number of simultaneous writers (mnt_writeopcount) * plus a bit of slop (2) in case some more writers show up while * we are cleaning. * * When requested to free up space, the amount of space that * we need is enough blocks to allocate a full-sized segment * (fs_contigsumsize). The number of such segments that will * be needed is set by the number of simultaneous writers * (mnt_writeopcount) plus a bit of slop (2) in case some more * writers show up while we are cleaning. * * Additionally, if we are unprivileged and allocating space, * we need to ensure that we clean up enough blocks to get the * needed number of blocks over the threshold of the minimum * number of blocks required to be kept free by the filesystem * (fs_minfree). */ if (resource == FLUSH_INODES_WAIT) { needed = vp->v_mount->mnt_writeopcount + 2; } else if (resource == FLUSH_BLOCKS_WAIT) { needed = (vp->v_mount->mnt_writeopcount + 2) * fs->fs_contigsumsize; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0)) needed += fragstoblks(fs, roundup((fs->fs_dsize * fs->fs_minfree / 100) - fs->fs_cstotal.cs_nffree, fs->fs_frag)); } else { UFS_LOCK(ump); printf("softdep_request_cleanup: Unknown resource type %d\n", resource); return (0); } starttime = time_second; retry: if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { ACQUIRE_LOCK(ump); if (ump->softdep_on_worklist > 0 && process_worklist_item(UFSTOVFS(ump), ump->softdep_on_worklist, LK_NOWAIT) != 0) stat_worklist_push += 1; FREE_LOCK(ump); } /* * If we still need resources and there are no more worklist * entries to process to obtain them, we have to start flushing * the dirty vnodes to force the release of additional requests * to the worklist that we can then process to reap additional * resources. We walk the vnodes associated with the mount point * until we get the needed worklist requests that we can reap.
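 *
 * For a sense of scale, the "needed" computation above works out as
 * follows with hypothetical numbers: 8 concurrent writers and an
 * fs_contigsumsize of 16 give needed = (8 + 2) * 16 = 160 blocks.
 * As a sketch:
 *
 *	#include <stdint.h>
 *
 *	static int64_t
 *	blocks_needed(int writers, int contigsumsize)
 *	{
 *		return ((int64_t)(writers + 2) * contigsumsize);
 *	}
 *
 * The unprivileged case then adds however many blocks are still
 * required to climb back over the fs_minfree reserve.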
*/ if ((resource == FLUSH_BLOCKS_WAIT && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) { if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) { VI_UNLOCK(lvp); continue; } if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT, curthread)) continue; if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(lvp); continue; } (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0); vput(lvp); } lvp = ump->um_devvp; if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { VOP_FSYNC(lvp, MNT_NOWAIT, curthread); VOP_UNLOCK(lvp, 0); } if (ump->softdep_on_worklist > 0) { stat_cleanup_retries += 1; goto retry; } stat_cleanup_failures += 1; } if (time_second - starttime > stat_cleanup_high_delay) stat_cleanup_high_delay = time_second - starttime; UFS_LOCK(ump); return (1); } static bool softdep_excess_items(struct ufsmount *ump, int item) { KASSERT(item >= 0 && item < D_LAST, ("item %d", item)); return (dep_current[item] > max_softdeps && ump->softdep_curdeps[item] > max_softdeps / stat_flush_threads); } static void schedule_cleanup(struct mount *mp) { struct ufsmount *ump; struct thread *td; ump = VFSTOUFS(mp); LOCK_OWNED(ump); FREE_LOCK(ump); td = curthread; if ((td->td_pflags & TDP_KTHREAD) != 0 && (td->td_proc->p_flag2 & P2_AST_SU) == 0) { /* * No AST is delivered to kernel threads, so nobody * would deref the mp. Some kernel threads * explicitly check for AST, e.g. the NFS daemon does * this in its serving loop. */ return; } if (td->td_su != NULL) vfs_rel(td->td_su); vfs_ref(mp); td->td_su = mp; thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } static void softdep_ast_cleanup_proc(void) { struct thread *td; struct mount *mp; struct ufsmount *ump; int error; bool req; td = curthread; while ((mp = td->td_su) != NULL) { td->td_su = NULL; error = vfs_busy(mp, MBF_NOWAIT); vfs_rel(mp); if (error != 0) return; if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) { ump = VFSTOUFS(mp); for (;;) { req = false; ACQUIRE_LOCK(ump); if (softdep_excess_items(ump, D_INODEDEP)) { req = true; request_cleanup(mp, FLUSH_INODES); } if (softdep_excess_items(ump, D_DIRREM)) { req = true; request_cleanup(mp, FLUSH_BLOCKS); } FREE_LOCK(ump); if (softdep_excess_items(ump, D_NEWBLK) || softdep_excess_items(ump, D_ALLOCDIRECT) || softdep_excess_items(ump, D_ALLOCINDIR)) { error = vn_start_write(NULL, &mp, V_WAIT); if (error == 0) { req = true; VFS_SYNC(mp, MNT_WAIT); vn_finished_write(mp); } } if ((td->td_pflags & TDP_KTHREAD) != 0 || !req) break; } } vfs_unbusy(mp); } } /* * If memory utilization has gotten too high, deliberately slow things * down and speed up the I/O processing. */ static int request_cleanup(mp, resource) struct mount *mp; int resource; { struct thread *td = curthread; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); /* * We never hold up the filesystem syncer or buf daemon. */ if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) return (0); /* * First check to see if the work list has gotten backlogged. * If it has, co-opt this process to help clean up two entries. * Because this process may hold inodes locked, we cannot * handle any remove requests that might block on a locked * inode as that could lead to deadlock. We set TDP_SOFTDEP * to avoid recursively processing the worklist.
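 *
 * The softdep_excess_items() test above combines a global and a
 * per-mount threshold: cleanup is forced only when the system-wide
 * count is past max_softdeps and this mount holds more than its share.
 * A sketch with hypothetical numbers (max_softdeps = 200000 and four
 * flush threads give a per-mount share of 50000):
 *
 *	#include <stdbool.h>
 *
 *	static bool
 *	excess(long global_cnt, long mount_cnt, long limit, int threads)
 *	{
 *		return (global_cnt > limit && mount_cnt > limit / threads);
 *	}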
*/ if (ump->softdep_on_worklist > max_softdeps / 10) { td->td_pflags |= TDP_SOFTDEP; process_worklist_item(mp, 2, LK_NOWAIT); td->td_pflags &= ~TDP_SOFTDEP; stat_worklist_push += 2; return(1); } /* * Next, we attempt to speed up the syncer process. If that * is successful, then we allow the process to continue. */ if (softdep_speedup(ump) && resource != FLUSH_BLOCKS_WAIT && resource != FLUSH_INODES_WAIT) return(0); /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories * with removal dependencies. We would like to do the cleanup * here, but we probably hold an inode locked at this point and * that might deadlock against one that we try to clean. So, * the best that we can do is request the syncer daemon to do * the cleanup for us. */ switch (resource) { case FLUSH_INODES: case FLUSH_INODES_WAIT: ACQUIRE_GBLLOCK(&lk); stat_ino_limit_push += 1; req_clear_inodedeps += 1; FREE_GBLLOCK(&lk); stat_countp = &stat_ino_limit_hit; break; case FLUSH_BLOCKS: case FLUSH_BLOCKS_WAIT: ACQUIRE_GBLLOCK(&lk); stat_blk_limit_push += 1; req_clear_remove += 1; FREE_GBLLOCK(&lk); stat_countp = &stat_blk_limit_hit; break; default: panic("request_cleanup: unknown type"); } /* * Hopefully the syncer daemon will catch up and awaken us. * We wait at most tickdelay before proceeding in any case. */ ACQUIRE_GBLLOCK(&lk); FREE_LOCK(ump); proc_waiting += 1; if (callout_pending(&softdep_callout) == FALSE) callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, pause_timer, 0); if ((td->td_pflags & TDP_KTHREAD) == 0) msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); proc_waiting -= 1; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump); return (1); } /* * Awaken processes pausing in request_cleanup and clear proc_waiting * to indicate that there is no longer a timer running. Pause_timer * will be called with the global softdep mutex (&lk) locked. */ static void pause_timer(arg) void *arg; { GBLLOCK_OWNED(&lk); /* * The callout_ API has acquired mtx and will hold it around this * function call. */ *stat_countp += proc_waiting; wakeup(&proc_waiting); } /* * If requested, try removing inode or removal dependencies. */ static void check_clear_deps(mp) struct mount *mp; { /* * If we are suspended, it may be because of our using * too many inodedeps, so help clear them out. */ if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended) clear_inodedeps(mp); /* * General requests for cleanup of backed up dependencies */ ACQUIRE_GBLLOCK(&lk); if (req_clear_inodedeps) { req_clear_inodedeps -= 1; FREE_GBLLOCK(&lk); clear_inodedeps(mp); ACQUIRE_GBLLOCK(&lk); wakeup(&proc_waiting); } if (req_clear_remove) { req_clear_remove -= 1; FREE_GBLLOCK(&lk); clear_remove(mp); ACQUIRE_GBLLOCK(&lk); wakeup(&proc_waiting); } FREE_GBLLOCK(&lk); } /* * Flush out a directory with at least one removal dependency in an effort to * reduce the number of dirrem, freefile, and freeblks dependency structures. 
*/ static void clear_remove(mp) struct mount *mp; { struct pagedep_hashhead *pagedephd; struct pagedep *pagedep; struct ufsmount *ump; struct vnode *vp; struct bufobj *bo; int error, cnt; ino_t ino; ump = VFSTOUFS(mp); LOCK_OWNED(ump); for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) { pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++]; if (ump->pagedep_nextclean > ump->pagedep_hash_size) ump->pagedep_nextclean = 0; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (LIST_EMPTY(&pagedep->pd_dirremhd)) continue; ino = pagedep->pd_ino; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(ump); /* * Let unmount clear deps */ error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) goto finish_write; error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); vfs_unbusy(mp); if (error != 0) { softdep_error("clear_remove: vget", error); goto finish_write; } if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) softdep_error("clear_remove: fsync", error); bo = &vp->v_bufobj; BO_LOCK(bo); drain_output(vp); BO_UNLOCK(bo); vput(vp); finish_write: vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } } } /* * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ static void clear_inodedeps(mp) struct mount *mp; { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; struct ufsmount *ump; struct vnode *vp; struct fs *fs; int error, cnt; ino_t firstino, lastino, ino; ump = VFSTOUFS(mp); fs = ump->um_fs; LOCK_OWNED(ump); /* * Pick a random inode dependency to be cleared. * We will then gather up all the inodes in its block * that have dependencies and flush them out. */ for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) { inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++]; if (ump->inodedep_nextclean > ump->inodedep_hash_size) ump->inodedep_nextclean = 0; if ((inodedep = LIST_FIRST(inodedephd)) != NULL) break; } if (inodedep == NULL) return; /* * Find the last inode in the block with dependencies. */ firstino = rounddown2(inodedep->id_ino, INOPB(fs)); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) break; /* * Asynchronously push all but the last inode with dependencies. * Synchronously push the last inode with dependencies to ensure * that the inode block gets written to free up the inodedeps. 
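 *
 * The block-of-inodes arithmetic here relies on INOPB(fs) being a
 * power of two, so rounddown2() reduces to a mask.  With hypothetical
 * numbers, INOPB = 64 and id_ino = 1000 give firstino = 1000 & ~63 =
 * 960, and the block then covers inodes 960 through 1023.  As a
 * sketch:
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	first_in_block(uint32_t ino, uint32_t inopb)
 *	{
 *		return (ino & ~(inopb - 1));
 *	}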
*/ for (ino = firstino; ino <= lastino; ino++) { if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) continue; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(ump); error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */ if (error != 0) { vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) { softdep_error("clear_inodedeps: vget", error); vfs_unbusy(mp); vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } vfs_unbusy(mp); if (ino == lastino) { if ((error = ffs_syncvnode(vp, MNT_WAIT, 0))) softdep_error("clear_inodedeps: fsync1", error); } else { if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) softdep_error("clear_inodedeps: fsync2", error); BO_LOCK(&vp->v_bufobj); drain_output(vp); BO_UNLOCK(&vp->v_bufobj); } vput(vp); vn_finished_write(mp); ACQUIRE_LOCK(ump); } } void softdep_buf_append(bp, wkhd) struct buf *bp; struct workhead *wkhd; { struct worklist *wk; struct ufsmount *ump; if ((wk = LIST_FIRST(wkhd)) == NULL) return; KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_buf_append called on non-softdep filesystem")); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(wkhd)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(ump); } void softdep_inode_append(ip, cred, wkhd) struct inode *ip; struct ucred *cred; struct workhead *wkhd; { struct buf *bp; struct fs *fs; struct ufsmount *ump; int error; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_inode_append called on non-softdep filesystem")); fs = ump->um_fs; error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, cred, &bp); if (error) { bqrelse(bp); softdep_freework(wkhd); return; } softdep_buf_append(bp, wkhd); bqrelse(bp); } void softdep_freework(wkhd) struct workhead *wkhd; { struct worklist *wk; struct ufsmount *ump; if ((wk = LIST_FIRST(wkhd)) == NULL) return; KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_freework called on non-softdep filesystem")); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); handle_jwork(wkhd); FREE_LOCK(ump); } /* * Function to determine if the buffer has outstanding dependencies * that will cause a roll-back if the buffer is written. If wantcount * is set, return number of dependencies, otherwise just yes or no. */ static int softdep_count_dependencies(bp, wantcount) struct buf *bp; int wantcount; { struct worklist *wk; struct ufsmount *ump; struct bmsafemap *bmsafemap; struct freework *freework; struct inodedep *inodedep; struct indirdep *indirdep; struct freeblks *freeblks; struct allocindir *aip; struct pagedep *pagedep; struct dirrem *dirrem; struct newblk *newblk; struct mkdir *mkdir; struct diradd *dap; int i, retval; retval = 0; if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) return (0); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_INODEDEP: inodedep = WK_INODEDEP(wk); if ((inodedep->id_state & DEPCOMPLETE) == 0) { /* bitmap allocation dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_extupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoreflst)) { /* Add reference dependency. 
*/ retval += 1; if (!wantcount) goto out; } continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) { /* indirect truncation dependency */ retval += 1; if (!wantcount) goto out; } LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { /* indirect block pointer dependency */ retval += 1; if (!wantcount) goto out; } continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { if (LIST_FIRST(&dirrem->dm_jremrefhd)) { /* Journal remove ref dependency. */ retval += 1; if (!wantcount) goto out; } } for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { /* directory entry dependency */ retval += 1; if (!wantcount) goto out; } } continue; case D_BMSAFEMAP: bmsafemap = WK_BMSAFEMAP(wk); if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { /* Add reference dependency. */ retval += 1; if (!wantcount) goto out; } if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { /* Allocate block dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_FREEBLKS: freeblks = WK_FREEBLKS(wk); if (LIST_FIRST(&freeblks->fb_jblkdephd)) { /* Freeblk journal dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk) { /* Journal allocate dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_MKDIR: mkdir = WK_MKDIR(wk); if (mkdir->md_jaddref) { /* Journal reference dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: case D_JSEG: case D_SBDEP: /* never a dependency on these blocks */ continue; default: panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out: FREE_LOCK(ump); return retval; } /* * Acquire exclusive access to a buffer. * Must be called with a locked mtx parameter. * Return acquired buffer or NULL on failure. */ static struct buf * getdirtybuf(bp, lock, waitfor) struct buf *bp; struct rwlock *lock; int waitfor; { int error; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) { if (waitfor != MNT_WAIT) return (NULL); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock); /* * Even if we successfully acquire bp here, we have dropped * lock, which may violate our guarantee. */ if (error == 0) BUF_UNLOCK(bp); else if (error != ENOLCK) panic("getdirtybuf: inconsistent lock: %d", error); rw_wlock(lock); return (NULL); } if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) { rw_wunlock(lock); BO_LOCK(bp->b_bufobj); BUF_UNLOCK(bp); if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO | PDROP, "getbuf", 0); } else BO_UNLOCK(bp->b_bufobj); rw_wlock(lock); return (NULL); } BUF_UNLOCK(bp); if (waitfor != MNT_WAIT) return (NULL); /* * The lock argument must be bp->b_vp's mutex in * this case. */ #ifdef DEBUG_VFS_LOCKS if (bp->b_vp->v_type != VCHR) ASSERT_BO_WLOCKED(bp->b_bufobj); #endif bp->b_vflags |= BV_BKGRDWAIT; rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0); return (NULL); } if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); return (NULL); } bremfree(bp); return (bp); } /* * Check if it is safe to suspend the file system now. On entry, * the vnode interlock for devvp should be held. Return 0 with * the mount interlock held if the file system can be suspended now, * otherwise return EAGAIN with the mount interlock held.
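 *
 * The decision below boils down to a quiescence check: snapshot a set
 * of activity counters before the vnode sync loop, then suspend only
 * if nothing moved in the meantime.  A sketch with hypothetical
 * counters:
 *
 *	#include <errno.h>
 *
 *	static int
 *	quiesced(long deps_then, long deps_now,
 *	    long writes_then, long writes_now)
 *	{
 *		return (deps_now == deps_then &&
 *		    writes_now == writes_then ? 0 : EAGAIN);
 *	}
 *
 * Like the real routine, the caller is expected to keep flushing and
 * retrying for as long as this returns EAGAIN.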
*/ int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_depcnt, int softdep_accdepcnt, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; struct ufsmount *ump; struct inodedep *inodedep; int error, unlinked; bo = &devvp->v_bufobj; ASSERT_BO_WLOCKED(bo); /* * If we are not running with soft updates, then we need only * deal with secondary writes as we try to suspend. */ if (MOUNTEDSOFTDEP(mp) == 0) { MNT_ILOCK(mp); while (mp->mnt_secondary_writes != 0) { BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); MNT_ILOCK(mp); } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; BO_UNLOCK(bo); return (error); } /* * If we are running with soft updates, then we need to coordinate * with them as we try to suspend. */ ump = VFSTOUFS(mp); for (;;) { if (!TRY_ACQUIRE_LOCK(ump)) { BO_UNLOCK(bo); ACQUIRE_LOCK(ump); FREE_LOCK(ump); BO_LOCK(bo); continue; } MNT_ILOCK(mp); if (mp->mnt_secondary_writes != 0) { FREE_LOCK(ump); BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); continue; } break; } unlinked = 0; if (MOUNTEDSUJ(mp)) { for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked); inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { if ((inodedep->id_state & (UNLINKED | UNLINKLINKS | UNLINKONLIST)) != (UNLINKED | UNLINKLINKS | UNLINKONLIST) || !check_inodedep_free(inodedep)) continue; unlinked++; } } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Softdep activity occurred after start of vnode sync loop * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || softdep_depcnt != unlinked || ump->softdep_deps != unlinked || softdep_accdepcnt != ump->softdep_accdeps || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; FREE_LOCK(ump); BO_UNLOCK(bo); return (error); } /* * Get the number of dependency structures for the file system, both * the current number and the total number allocated. These will * later be used to detect that softdep processing has occurred. */ void softdep_get_depcounts(struct mount *mp, int *softdep_depsp, int *softdep_accdepsp) { struct ufsmount *ump; if (MOUNTEDSOFTDEP(mp) == 0) { *softdep_depsp = 0; *softdep_accdepsp = 0; return; } ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); *softdep_depsp = ump->softdep_deps; *softdep_accdepsp = ump->softdep_accdeps; FREE_LOCK(ump); } /* * Wait for pending output on a vnode to complete. * Must be called with vnode lock and interlock locked. * * XXX: Should just be a call to bufobj_wwait(). */ static void drain_output(vp) struct vnode *vp; { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_VOP_LOCKED(vp, "drain_output"); ASSERT_BO_WLOCKED(bo); while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; msleep((caddr_t)&bo->bo_numoutput, BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0); } } /* * Called whenever a buffer that is being invalidated or reallocated * contains dependencies. This should only happen if an I/O error has * occurred. The routine is called with the buffer locked. 
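 *
 * The drain_output() helper above is the kernel form of the usual
 * wait-until-a-counter-reaches-zero pattern.  In userland the same
 * thing is normally written with a condition variable; a sketch with
 * hypothetical globals:
 *
 *	#include <pthread.h>
 *
 *	static pthread_mutex_t drain_mtx = PTHREAD_MUTEX_INITIALIZER;
 *	static pthread_cond_t drain_cv = PTHREAD_COND_INITIALIZER;
 *	static int outstanding;
 *
 *	static void
 *	drain(void)
 *	{
 *		pthread_mutex_lock(&drain_mtx);
 *		while (outstanding > 0)
 *			pthread_cond_wait(&drain_cv, &drain_mtx);
 *		pthread_mutex_unlock(&drain_mtx);
 *	}
 *
 * The completion path would decrement outstanding and broadcast on
 * drain_cv while holding drain_mtx.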
*/ static void softdep_deallocate_dependencies(bp) struct buf *bp; { if ((bp->b_ioflags & BIO_ERROR) == 0) panic("softdep_deallocate_dependencies: dangling deps"); if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL) softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); else printf("softdep_deallocate_dependencies: " "got error %d while accessing filesystem\n", bp->b_error); if (bp->b_error != ENXIO) panic("softdep_deallocate_dependencies: unrecovered I/O error"); } /* * Function to handle asynchronous write errors in the filesystem. */ static void softdep_error(func, error) char *func; int error; { /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } #ifdef DDB static void inodedep_print(struct inodedep *inodedep, int verbose) { db_printf("%p fs %p st %x ino %jd inoblk %jd delta %jd nlink %jd" " saveino %p\n", inodedep, inodedep->id_fs, inodedep->id_state, (intmax_t)inodedep->id_ino, (intmax_t)fsbtodb(inodedep->id_fs, ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), (intmax_t)inodedep->id_nlinkdelta, (intmax_t)inodedep->id_savednlink, inodedep->id_savedino1); if (verbose == 0) return; db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, " "mkdiradd %p\n", LIST_FIRST(&inodedep->id_pendinghd), LIST_FIRST(&inodedep->id_bufwait), LIST_FIRST(&inodedep->id_inowait), TAILQ_FIRST(&inodedep->id_inoreflst), inodedep->id_mkdiradd); db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", TAILQ_FIRST(&inodedep->id_inoupdt), TAILQ_FIRST(&inodedep->id_newinoupdt), TAILQ_FIRST(&inodedep->id_extupdt), TAILQ_FIRST(&inodedep->id_newextupdt)); } DB_SHOW_COMMAND(inodedep, db_show_inodedep) { if (have_addr == 0) { db_printf("Address required\n"); return; } inodedep_print((struct inodedep*)addr, 1); } DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; struct ufsmount *ump; int cnt; if (have_addr == 0) { db_printf("Address required\n"); return; } ump = (struct ufsmount *)addr; for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) { inodedephd = &ump->inodedep_hashtbl[cnt]; LIST_FOREACH(inodedep, inodedephd, id_hash) { inodedep_print(inodedep, 0); } } } DB_SHOW_COMMAND(worklist, db_show_worklist) { struct worklist *wk; if (have_addr == 0) { db_printf("Address required\n"); return; } wk = (struct worklist *)addr; printf("worklist: %p type %s state 0x%X\n", wk, TYPENAME(wk->wk_type), wk->wk_state); } DB_SHOW_COMMAND(workhead, db_show_workhead) { struct workhead *wkhd; struct worklist *wk; int i; if (have_addr == 0) { db_printf("Address required\n"); return; } wkhd = (struct workhead *)addr; wk = LIST_FIRST(wkhd); for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) db_printf("worklist: %p type %s state 0x%X", wk, TYPENAME(wk->wk_type), wk->wk_state); if (i == 100) db_printf("workhead overflow"); printf("\n"); } DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) { struct mkdirlist *mkdirlisthd; struct jaddref *jaddref; struct diradd *diradd; struct mkdir *mkdir; if (have_addr == 0) { db_printf("Address required\n"); return; } mkdirlisthd = (struct mkdirlist *)addr; LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) { diradd = mkdir->md_diradd; db_printf("mkdir: %p state 0x%X dap %p state 0x%X", mkdir, mkdir->md_state, diradd, diradd->da_state); if ((jaddref = mkdir->md_jaddref) != NULL) db_printf(" jaddref %p jaddref state 0x%X", jaddref, jaddref->ja_state); db_printf("\n"); } } /* exported to ffs_vfsops.c */ extern void db_print_ffs(struct ufsmount *ump); void 
db_print_ffs(struct ufsmount *ump) { db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n", ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp, ump->um_fs, ump->softdep_on_worklist, ump->softdep_deps, ump->softdep_req); } #endif /* DDB */ #endif /* SOFTUPDATES */ Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 313779) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 313780) @@ -1,2284 +1,2284 @@ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", NULL }; static int ffs_mount(struct mount *mp) { struct vnode *devvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; pid_t fsckpid = 0; int error, error1, flags; uint64_t mntorflags; accmode_t accmode; struct nameidata ndp; char *fspec; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. 
*/ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { /* * Once we have set the restricted PID, do not * persist "fsckpid" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "fsckpid"); vfs_deleteopt(mp->mnt_opt, "fsckpid"); if (mp->mnt_flag & MNT_UPDATE) { if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } /* Set to -1 if we are done */ if (fsckpid == 0) fsckpid = -1; } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fsckpid == -1 && ump->um_fsckpid > 0) { if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) return (error); g_topology_lock(); /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); ump->um_fsckpid = 0; } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (MOUNTEDSOFTDEP(mp)) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vfs_write_resume(mp, 0); return (error); } if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td, 0)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If we are running a checker, do not allow upgrade. */ if (ump->um_fsckpid > 0) { vfs_mount_error(mp, "Active checker, cannot upgrade to write"); return (EINVAL); } /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. 
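 *
 * The check that follows is a standard two-step authorization
 * fallback: ask for discretionary access first, and only if that is
 * denied ask whether the thread holds an overriding privilege.
 * Schematically, with hypothetical helpers:
 *
 *	int check_access(void);
 *	int check_privilege(void);
 *
 *	static int
 *	may_mount_rw(void)
 *	{
 *		int error;
 *
 *		if ((error = check_access()) != 0)
 *			error = check_privilege();
 *		return (error);
 *	}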
*/ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } VOP_UNLOCK(devvp, 0); fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } g_topology_lock(); /* * Request exclusive write access. */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (MOUNTEDSOFTDEP(mp)) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * If this is a request from fsck to clean up the filesystem, * then allow the specified pid to proceed. */ if (fsckpid > 0) { if (ump->um_fsckpid != 0) { vfs_mount_error(mp, "Active checker already running on %s", fs->fs_fsmnt); return (EINVAL); } KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { vfs_mount_error(mp, "Checker activation failed on %s", fs->fs_fsmnt); return (error); } ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); /* * Must not call namei() while owning busy ref. */ vfs_unbusy(mp); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); error = namei(&ndp); if ((mp->mnt_flag & MNT_UPDATE) != 0) { /* * Unmount does not start if MNT_UPDATE is set. Mount * update busies mp before setting MNT_UPDATE. We * must be able to retain our busy ref successfully, * without sleep.
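 * (This is why the vfs_busy() just below passes MBF_NOWAIT and merely
 * asserts success rather than handling a failure.)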
*/ error1 = vfs_busy(mp, MBF_NOWAIT); MPASS(error1 == 0); } if (error != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount_alloc() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } if (fsckpid > 0) { KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { printf("WARNING: %s: Checker activation " "failed\n", fs->fs_fsmnt); } else { ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; struct export_args exp; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); vfs_oexport_conv(&args.export, &exp); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &exp, sizeof(exp)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary * writers, if requested. * 6) invalidate all cached file data. * 7) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, struct thread *td, int flags) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, error; u_long size; int32_t *lp; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0); /* * Step 2: re-read superblock from disk. 
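 * The re-read superblock is not trusted until it passes the same
 * sanity test applied at mount time (see below), which amounts to:
 *
 *	(fs_magic == FS_UFS1_MAGIC || fs_magic == FS_UFS2_MAGIC) &&
 *	    fs_bsize <= MAXBSIZE && fs_bsize >= sizeof(struct fs)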
*/ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; newfs->fs_ronly = fs->fs_ronly; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); free(fs->fs_csp, M_UFSMNT); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); if ((flags & FFSR_UNSUSPEND) != 0) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0); vrele(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0); vrele(vp); } return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, len, ronly; u_long size; int32_t *lp; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; bp = NULL; ump = NULL; cred = td ? 
td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { VOP_UNLOCK(devvp, 0); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); VOP_UNLOCK(devvp, 0); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; VOP_UNLOCK(devvp, 0); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indices */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck.", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. 
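 * g_io_getattr() returns 0 on success and rewrites len to the actual
 * attribute length, so the pattern below is: allocate generously,
 * query, then trim. As an illustrative sketch (where "buf" stands in
 * for mp->mnt_gjprovider):
 *
 *	len = 1024;
 *	buf = malloc(len, M_UFSMNT, M_WAITOK);
 *	if (g_io_getattr("GJOURNAL::provider", cp, &len, buf) == 0)
 *		buf = realloc(buf, len, M_UFSMNT, M_WAITOK);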
*/ len = 1024; mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &len, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= 
~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { len = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &len, &ump->um_candelete) == 0) { if (!ump->um_candelete) printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); ump->um_candelete = 0; } if (ump->um_candelete) { ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); } } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem state information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ return (0); out: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; } atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); dev_rel(dev); return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. 
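 * (Further down, the UFS1 fs_maxfilesize is also clamped to what its
 * 32-bit block pointers can address, (2^31) * fs_bsize - 1 bytes; with
 * an 8K block size that works out to 2^44 - 1, i.e. 16TB - 1.)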
*/ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; susp = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; susp = fs->fs_ronly == 0; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && error != ENXIO) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 
0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (error && error != ENXIO) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); if (ump->um_trim_tq != NULL) { while (ump->um_trim_inflight != 0) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); } g_topology_lock(); if (ump->um_fsckpid > 0) { /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); ump->um_fsckpid = 0; } g_vfs_close(ump->um_cp); g_topology_unlock(); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); vrele(ump->um_devvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0); return (error); } /* * Get filesystem statistics. 
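 * Free space is reported in fragment-sized units:
 *
 *	f_bfree = cs_nbfree * fs_frag + cs_nffree +
 *	    dbtofsb(fs, fs_pendingblocks)
 *
 * and f_files subtracts UFS_ROOTINO from the total inode count, since
 * the inodes below the root inode are reserved and never allocated.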
*/ static int ffs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); - sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; + sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = NAME_MAX; return (0); } static bool sync_doupdate(struct inode *ip) { return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(mp) struct mount *mp; { struct vnode *mvp, *vp; struct inode *ip; struct thread *td; int allerror, error; allerror = 0; td = curthread; if ((mp->mnt_flag & MNT_NOATIME) != 0) goto qupdate; MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td)) != 0) continue; if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } qupdate: #ifdef QUOTA qsync(mp); #endif if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. 
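 * The lock request is chosen once per pass: a blocking exclusive lock
 * for MNT_WAIT, LK_NOWAIT otherwise, always with LK_INTERLOCK and
 * LK_SLEEPFAIL so a vnode that changed identity while we slept is
 * retried instead of trusted. For MNT_SUSPEND, the secondary write
 * and softdep dependency counters are snapshotted before the pass so
 * softdep_check_suspend() can tell whether anything changed under us.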
*/ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) lockreq = LK_EXCLUSIVE; lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq, td)) != 0) { if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) allerror = error; vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp, 0); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(mp, ino, flags, vpp, ffs_flags) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int ffs_flags; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; int error; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); fs = ump->um_fs; ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? 
&ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { while (ip->i_gen == 0) ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via
exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ufid *ufhp; struct ufsmount *ump; struct fs *fs; struct cg *cgp; struct buf *bp; ino_t ino; u_int cg; int error; ufhp = (struct ufid *)fhp; ino = ufhp->ufid_ino; ump = VFSTOUFS(mp); fs = ump->um_fs; - if (ino < ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) + if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); /* * Need to check if inode is initialized because UFS2 does lazy * initialization and nfs_fhtovp can offer arbitrary inode numbers. */ if (fs->fs_magic != FS_UFS2_MAGIC) return (ufs_fhtovp(mp, ufhp, flags, vpp)); cg = ino_to_cg(fs, ino); error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) return (error); cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp) || ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { brelse(bp); return (ESTALE); } brelse(bp); return (ufs_fhtovp(mp, ufhp, flags, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(ump, waitfor, suspended) struct ufsmount *ump; int waitfor; int suspended; { struct fs *fs = ump->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(ump->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. 
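 * (SBLOCK_UFS1 is byte offset 8192 and SBLOCK_UFS2 is 65536; a stale
 * fs_sblockloc inherited from an old filesystem is corrected below
 * before the superblock goes out.)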
*/ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, ump); if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * We should mark the cylinder group buffer origbp as * dirty, to not lose the failed write. */ if ((bp->b_ioflags & BIO_ERROR) != 0) origbp->b_vflags |= BV_BKGRDERR; BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ pbrelvp(bp); if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; /* * Prevent brelse() from trying to keep and re-dirtying bp on * errors. It causes b_bufobj dereference in * bdirty()/reassignbuf(), and b_bufobj was cleared in * pbrelvp() above. */ if ((bp->b_ioflags & BIO_ERROR) != 0) bp->b_flags |= B_INVAL; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid.
* * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); newbp->b_xflags |= BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. 
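 * Background copies can be disabled system-wide for debugging via the
 * sysctl declared above, e.g.:
 *
 *	sysctl debug.dobkgrdwrite=0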
*/ bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int error; struct buf *tbp; int nocopy; vp = bo2vnode(bo); if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif } g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ Index: head/sys/ufs/ffs/ffs_vnops.c =================================================================== --- head/sys/ufs/ffs/ffs_vnops.c (revision 313779) +++ head/sys/ufs/ffs/ffs_vnops.c (revision 313780) @@ -1,1725 +1,1726 @@ /*- * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_ffs.h" #define ALIGNED_TO(ptr, s) \ (((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0) #ifdef DIRECTIO extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif static vop_fdatasync_t ffs_fdatasync; static vop_fsync_t ffs_fsync; static vop_getpages_t ffs_getpages; static vop_lock1_t ffs_lock; static vop_read_t ffs_read; static vop_write_t ffs_write; static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred); static vop_strategy_t ffsext_strategy; static vop_closeextattr_t ffs_closeextattr; static vop_deleteextattr_t ffs_deleteextattr; static vop_getextattr_t ffs_getextattr; static vop_listextattr_t ffs_listextattr; static vop_openextattr_t ffs_openextattr; static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; /* Global vfs data structures for ufs. 
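 * Two sets of tables are kept: ffs_vnodeops1/ffs_fifoops1 serve UFS1
 * vnodes, while ffs_vnodeops2/ffs_fifoops2 additionally wire up the
 * extended attribute operations available on UFS2.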
*/ struct vop_vector ffs_vnodeops1 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops1 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */ .vop_vptofh = ffs_vptofh, }; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops2 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_getpages = ffs_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops2 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_fdatasync = ffs_fdatasync, .vop_lock1 = ffs_lock, .vop_reallocblks = ffs_reallocblks, .vop_strategy = ffsext_strategy, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; /* * Synch an open file. */ /* ARGSUSED */ static int ffs_fsync(struct vop_fsync_args *ap) { struct vnode *vp; struct bufobj *bo; int error; vp = ap->a_vp; bo = &vp->v_bufobj; retry: error = ffs_syncvnode(vp, ap->a_waitfor, 0); if (error) return (error); if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) { error = softdep_fsync(vp); if (error) return (error); /* * The softdep_fsync() function may drop vp lock, * allowing for dirty buffers to reappear on the * bo_dirty list. Recheck and resync as needed. */ BO_LOCK(bo); if ((vp->v_type == VREG || vp->v_type == VDIR) && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { BO_UNLOCK(bo); goto retry; } BO_UNLOCK(bo); } return (0); } int ffs_syncvnode(struct vnode *vp, int waitfor, int flags) { struct inode *ip; struct bufobj *bo; struct buf *bp, *nbp; ufs_lbn_t lbn; int error, passes; bool still_dirty, wait; ip = VTOI(vp); ip->i_flag &= ~IN_NEEDSYNC; bo = &vp->v_bufobj; /* * When doing MNT_WAIT we must first flush all dependencies * on the inode. */ if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && (error = softdep_sync_metadata(vp)) != 0) return (error); /* * Flush all dirty buffers associated with a vnode. */ error = 0; passes = 0; wait = false; /* Always do an async pass first. */ lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); BO_LOCK(bo); loop: TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { /* * Reasons to skip this buffer: it has already been considered * on this pass, the buffer has dependencies that will cause * it to be redirtied and it has not already been deferred, * or it is already being written. */ if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; /* * Flush indirects in order, if requested. * * Note that if only datasync is requested, we can * skip indirect blocks when softupdates are not * active. 
Otherwise we must flush them with data, * since dependencies prevent data block writes. */ - if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR && + if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR && (lbn_level(bp->b_lblkno) >= passes || ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) continue; if (bp->b_lblkno > lbn) panic("ffs_syncvnode: syncing truncated data."); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { BO_UNLOCK(bo); } else if (wait) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) != 0) { bp->b_vflags &= ~BV_SCANNED; goto next; } } else continue; if ((bp->b_flags & B_DELWRI) == 0) panic("ffs_fsync: not dirty"); /* * Check for dependencies and potentially complete them. */ if (!LIST_EMPTY(&bp->b_dep) && (error = softdep_sync_buf(vp, bp, wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { /* I/O error. */ if (error != EBUSY) { BUF_UNLOCK(bp); return (error); } /* If we deferred once, don't defer again. */ if ((bp->b_flags & B_DEFERRED) == 0) { bp->b_flags |= B_DEFERRED; BUF_UNLOCK(bp); goto next; } } if (wait) { bremfree(bp); if ((error = bwrite(bp)) != 0) return (error); } else if ((bp->b_flags & B_CLUSTEROK)) { (void) vfs_bio_awrite(bp); } else { bremfree(bp); (void) bawrite(bp); } next: /* * Since we may have slept during the I/O, we need * to start from a known point. */ BO_LOCK(bo); nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd); } if (waitfor != MNT_WAIT) { BO_UNLOCK(bo); if ((flags & NO_INO_UPDT) != 0) return (0); else return (ffs_update(vp, 0)); } /* Drain IO to see if we're done. */ bufobj_wwait(bo, 0, 0); /* * Block devices associated with filesystems may have new I/O * requests posted for them even if the vnode is locked, so no * amount of trying will get them clean. We make several passes * as a best effort. * * Regular files may need multiple passes to flush all dependency * work as it is possible that we must write once per indirect * level, once for the leaf, and once for the inode and each of * these will be done with one sync and one async pass. */ if (bo->bo_dirty.bv_cnt > 0) { if ((flags & DATA_ONLY) == 0) { still_dirty = true; } else { /* * For data-only sync, dirty indirect buffers * are ignored. */ still_dirty = false; TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { - if (bp->b_lblkno > -NDADDR) { + if (bp->b_lblkno > -UFS_NDADDR) { still_dirty = true; break; } } } if (still_dirty) { /* Write the inode after sync passes to flush deps. */ if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) { BO_UNLOCK(bo); ffs_update(vp, 1); BO_LOCK(bo); } /* switch between sync/async. 
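 * The pass budget below, UFS_NIADDR + 2, matches the worst case named
 * above: one pass per level of indirection, one for the leaf data,
 * and one for the inode itself.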
*/ wait = !wait; - if (wait || ++passes < NIADDR + 2) + if (wait || ++passes < UFS_NIADDR + 2) goto loop; #ifdef INVARIANTS if (!vn_isdisk(vp, NULL)) vn_printf(vp, "ffs_fsync: dirty "); #endif } } BO_UNLOCK(bo); error = 0; if ((flags & DATA_ONLY) == 0) { if ((flags & NO_INO_UPDT) == 0) error = ffs_update(vp, 1); if (DOINGSUJ(vp)) softdep_journal_fsync(VTOI(vp)); } return (error); } static int ffs_fdatasync(struct vop_fdatasync_args *ap) { return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); } static int ffs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; char *file; int line; } */ *ap; { #ifndef NO_FFS_SNAPSHOT struct vnode *vp; int flags; struct lock *lkp; int result; switch (ap->a_flags & LK_TYPE_MASK) { case LK_SHARED: case LK_UPGRADE: case LK_EXCLUSIVE: vp = ap->a_vp; flags = ap->a_flags; for (;;) { #ifdef DEBUG_VFS_LOCKS KASSERT(vp->v_holdcnt != 0, ("ffs_lock %p: zero hold count", vp)); #endif lkp = vp->v_vnlock; result = _lockmgr_args(lkp, flags, VI_MTX(vp), LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file, ap->a_line); if (lkp == vp->v_vnlock || result != 0) break; /* * Apparent success, except that the vnode * mutated between snapshot file vnode and * regular file vnode while this process * slept. The lock currently held is not the * right lock. Release it, and try to get the * new lock. */ (void) _lockmgr_args(lkp, LK_RELEASE, NULL, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file, ap->a_line); if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == (LK_INTERLOCK | LK_NOWAIT)) return (EBUSY); if ((flags & LK_TYPE_MASK) == LK_UPGRADE) flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; flags &= ~LK_INTERLOCK; } break; default: result = VOP_LOCK1_APV(&ufs_vnodeops, ap); } return (result); #else return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); #endif } /* * Vnode op for reading. */ static int ffs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct inode *ip; struct uio *uio; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; int error; int seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extread(vp, uio, ioflag)); #else panic("ffs_read+IO_EXT"); #endif #ifdef DIRECTIO if ((ioflag & IO_DIRECT) != 0) { int workdone; error = ffs_rawread(vp, uio, &workdone); if (error != 0 || workdone != 0) return error; } #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("ffs_read: mode"); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("ffs_read: short symlink"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("ffs_read: type %d", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); fs = ITOFS(ip); if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). 
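/*
 * The "size of buffer" rule being described, as a sketch in plain
 * arithmetic (illustrative helper, not the kernel's blksize() macro,
 * which uses shift/mask forms): a block in the indirect range
 * (lbn >= UFS_NDADDR), or one the file extends past, is always a full
 * fs_bsize; the trailing partial block of a short file is rounded up
 * to whole fragments only.
 */
static long
file_block_size(long isize, long lbn, long bsize, long fsize, int ndaddr)
{
	if (lbn >= ndaddr || isize >= (lbn + 1) * bsize)
		return (bsize);
	/* roundup(isize % bsize, fsize) */
	return (((isize % bsize) + fsize - 1) / fsize * fsize);
}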
*/ size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) { /* * Don't do readahead if this is the end of the file. */ error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, * grab as much as we can. * * XXX This may not be a win if we are not * doing sequential access. */ error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, GB_UNMAPPED, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then * if we appear to be acting sequentially, * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ u_int nextsize = blksize(fs, ip, nextlbn); error = breadn_flags(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, GB_UNMAPPED, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. */ error = bread_gb(vp, lbn, size, NOCRED, GB_UNMAPPED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { error = vn_io_fault_pgmove(bp->b_pages, blkoffset, (int)xfersize, uio); } if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 && (ip->i_flag & IN_ACCESS) == 0) { VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); } return (error); } /* * Vnode op for writing. 
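/*
 * Recapping the read loop above: the per-iteration transfer size
 * reduces to three clamps. A standalone sketch (names illustrative,
 * not kernel API):
 */
static long
read_xfersize(long bsize, long blkoffset, long resid, long bytesinfile)
{
	long xfersize;

	xfersize = bsize - blkoffset;	/* rest of this block */
	if (resid < xfersize)		/* caller asked for less */
		xfersize = resid;
	if (bytesinfile < xfersize)	/* file ends sooner */
		xfersize = bytesinfile;
	return (xfersize);
}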
*/ static int ffs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct uio *uio; struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; ssize_t resid; int seqcount; int blkoffset, error, flags, ioflag, size, xfersize; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); #else panic("ffs_write+IO_EXT"); #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("ffs_write: mode"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: panic("ffs_write: dir write"); break; default: panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, (int)uio->uio_offset, (int)uio->uio_resid ); } KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); fs = ITOFS(ip); if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if (ioflag & IO_SYNC) flags |= IO_SYNC; flags |= BA_UNMAPPED; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; /* XXX is uio->uio_offset the right thing here? */ error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ap->a_cred, flags, &bp); if (error != 0) { vnode_pager_setsize(vp, ip->i_size); break; } if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; DIP_SET(ip, i_size, ip->i_size); } size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; if (buf_mapped(bp)) { error = vn_io_fault_uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); } else { error = vn_io_fault_pgmove(bp->b_pages, blkoffset, (int)xfersize, uio); } /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we need only clear buffers with a transfer size * equal to the block size because buffers with a shorter * transfer size were cleared above by the call to UFS_BALLOC() * with the BA_CLRBUF flag set. * * If the source region for uiomove identically mmaps the * buffer, uiomove() performed the NOP copy, and the buffer * content remains valid because the page fault handler * validated the pages. 
*/ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->fs_bsize == xfersize) vfs_bio_clrbuf(bp); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->fs_bsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount, GB_UNMAPPED); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) { ip->i_mode &= ~(ISUID | ISGID); DIP_SET(ip, i_mode, ip->i_mode); } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Extended attribute area reading. */ static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; ssize_t orig_resid; int error; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extread: mode"); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = sblksize(fs, dp->di_extsize, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= dp->di_extsize) { /* * Don't do readahead if this is the end of the info. */ error = bread(vp, -1 - lbn, size, NOCRED, &bp); } else { /* * If we have a second block, then * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. 
*/ u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn); nextlbn = -1 - nextlbn; error = breadn(vp, -1 - lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); return (error); } /* * Extended attribute area writing. */ static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; ssize_t resid; int blkoffset, error, flags, size, xfersize; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extwrite: mode"); #endif if (ioflag & IO_APPEND) uio->uio_offset = dp->di_extsize; KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); - if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) + if ((uoff_t)uio->uio_offset + uio->uio_resid > + UFS_NXADDR * fs->fs_bsize) return (EFBIG); resid = uio->uio_resid; osize = dp->di_extsize; flags = IO_EXT; if (ioflag & IO_SYNC) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ucred, flags, &bp); if (error != 0) break; /* * If the buffer is not valid we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); if (uio->uio_offset + xfersize > dp->di_extsize) dp->di_extsize = uio->uio_offset + xfersize; size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). 
*/ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || xfersize + blkoffset == fs->fs_bsize || (ioflag & (IO_ASYNC | IO_DIRECT))) bawrite(bp); else bdwrite(bp); if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) { ip->i_mode &= ~(ISUID | ISGID); dp->di_mode = ip->i_mode; } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_EXT | (ioflag&IO_SYNC), ucred); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Vnode operating to retrieve a named extended attribute. * * Locate a particular EA (nspace:name) in the area (ptr:length), and return * the length of the EA, and possibly the pointer to the entry and to the data. */ static int ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, struct extattr **eapp, u_char **eac) { struct extattr *eap, *eaend; size_t nlen; nlen = strlen(name); KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned")); eap = (struct extattr *)ptr; eaend = (struct extattr *)(ptr + length); for (; eap < eaend; eap = EXTATTR_NEXT(eap)) { /* make sure this entry is complete */ if (EXTATTR_NEXT(eap) > eaend) break; if (eap->ea_namespace != nspace || eap->ea_namelength != nlen || memcmp(eap->ea_name, name, nlen) != 0) continue; if (eapp != NULL) *eapp = eap; if (eac != NULL) *eac = EXTATTR_CONTENT(eap); return (EXTATTR_CONTENT_SIZE(eap)); } return (-1); } static int ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct uio luio; struct iovec liovec; u_int easize; int error; u_char *eae; ip = VTOI(vp); fs = ITOFS(ip); dp = ip->i_din2; easize = dp->di_extsize; - if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize) + if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize) return (EFBIG); eae = malloc(easize + extra, M_TEMP, M_WAITOK); liovec.iov_base = eae; liovec.iov_len = easize; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = easize; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_READ; luio.uio_td = td; error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); if (error) { free(eae, M_TEMP); return(error); } *p = eae; return (0); } static void ffs_lock_ea(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); VI_LOCK(vp); while (ip->i_flag & IN_EA_LOCKED) { ip->i_flag |= IN_EA_LOCKWAIT; msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea", 0); } ip->i_flag |= IN_EA_LOCKED; VI_UNLOCK(vp); } static void ffs_unlock_ea(struct vnode *vp) { struct inode *ip; ip = VTOI(vp); VI_LOCK(vp); if (ip->i_flag & IN_EA_LOCKWAIT) wakeup(&ip->i_ea_refs); ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT); VI_UNLOCK(vp); } static int ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) { struct inode *ip; struct ufs2_dinode *dp; int error; ip = VTOI(vp); ffs_lock_ea(vp); if (ip->i_ea_area != NULL) { ip->i_ea_refs++; ffs_unlock_ea(vp); return (0); } dp = ip->i_din2; error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); if (error) { ffs_unlock_ea(vp); return (error); } ip->i_ea_len = dp->di_extsize; ip->i_ea_error = 0; ip->i_ea_refs++; ffs_unlock_ea(vp); return (0); } /* * Vnode 
extattr transaction commit/abort */ static int ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) { struct inode *ip; struct uio luio; struct iovec liovec; int error; struct ufs2_dinode *dp; ip = VTOI(vp); ffs_lock_ea(vp); if (ip->i_ea_area == NULL) { ffs_unlock_ea(vp); return (EINVAL); } dp = ip->i_din2; error = ip->i_ea_error; if (commit && error == 0) { ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit"); if (cred == NOCRED) cred = vp->v_mount->mnt_cred; liovec.iov_base = ip->i_ea_area; liovec.iov_len = ip->i_ea_len; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = ip->i_ea_len; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_WRITE; luio.uio_td = td; /* XXX: I'm not happy about truncating to zero size */ if (ip->i_ea_len < dp->di_extsize) error = ffs_truncate(vp, 0, IO_EXT, cred); error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); } if (--ip->i_ea_refs == 0) { free(ip->i_ea_area, M_TEMP); ip->i_ea_area = NULL; ip->i_ea_len = 0; ip->i_ea_error = 0; } ffs_unlock_ea(vp); return (error); } /* * Vnode extattr strategy routine for fifos. * * We need to check for a read or write of the external attributes. * Otherwise we just fall through and do the usual thing. */ static int ffsext_strategy(struct vop_strategy_args *ap) /* struct vop_strategy_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct buf *a_bp; }; */ { struct vnode *vp; daddr_t lbn; vp = ap->a_vp; lbn = ap->a_bp->b_lblkno; - if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -NXADDR) + if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR) return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); if (vp->v_type == VFIFO) return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); panic("spec nodes went here"); } /* * Vnode extattr transaction commit/abort */ static int ffs_openextattr(struct vop_openextattr_args *ap) /* struct vop_openextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); } /* * Vnode extattr transaction commit/abort */ static int ffs_closeextattr(struct vop_closeextattr_args *ap) /* struct vop_closeextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_commit; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); } /* * Vnode operation to remove a named attribute. */ static int ffs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; struct extattr *eap; uint32_t ul; int olen, error, i, easize; u_char *eae; void *tmp; ip = VTOI(ap->a_vp); fs = ITOFS(ip); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) { /* * ffs_lock_ea is not needed there, because the vnode * must be exclusively locked. 
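/*
 * The delete path below operates on a malloc'd copy of the packed EA
 * area and slides the tail down over the doomed record. A generic
 * sketch of that compaction (plain C types, not the kernel structures;
 * off is the record's byte offset, reclen its ea_length):
 */
#include <string.h>

static size_t
packed_record_delete(unsigned char *area, size_t size, size_t off,
    size_t reclen)
{
	/* move everything after the record down over it */
	memmove(area + off, area + off + reclen, size - off - reclen);
	return (size - reclen);		/* new length of the area */
}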
*/ if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); /* CEM: delete could be done in-place instead */ eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &eap, NULL); if (olen == -1) { /* delete but nonexistent */ free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return (ENOATTR); } ul = eap->ea_length; i = (u_char *)EXTATTR_NEXT(eap) - eae; bcopy(EXTATTR_NEXT(eap), eap, easize - i); easize -= ul; tmp = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(tmp, M_TEMP); error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int ffs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; u_char *eae, *p; unsigned easize; int error, ealen; ip = VTOI(ap->a_vp); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); eae = ip->i_ea_area; easize = ip->i_ea_len; ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, NULL, &p); if (ealen >= 0) { error = 0; if (ap->a_size != NULL) *ap->a_size = ealen; else if (ap->a_uio != NULL) error = uiomove(p, ealen, ap->a_uio); } else error = ENOATTR; ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ffs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct extattr *eap, *eaend; int error, ealen; ip = VTOI(ap->a_vp); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); error = 0; if (ap->a_size != NULL) *ap->a_size = 0; KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned")); eap = (struct extattr *)ip->i_ea_area; eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len); for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) { /* make sure this entry is complete */ if (EXTATTR_NEXT(eap) > eaend) break; if (eap->ea_namespace != ap->a_attrnamespace) continue; ealen = eap->ea_namelength; if (ap->a_size != NULL) *ap->a_size += ealen + 1; else if (ap->a_uio != NULL) error = uiomove(&eap->ea_namelength, ealen + 1, ap->a_uio); } ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return (error); } /* * Vnode operation to set a named attribute. 
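/*
 * Record sizing as computed by ffs_setextattr() below: a 7-byte header
 * (32-bit record length plus one byte each of namespace, content pad
 * length, and name length), then the name, padded to an 8-byte
 * boundary, then the content, also padded to 8 bytes. Standalone
 * arithmetic sketch:
 */
#include <stddef.h>
#include <stdint.h>

#define	ROUNDUP8(x)	(((x) + 7) & ~(size_t)7)

static size_t
ea_record_size(size_t namelen, size_t contentlen)
{
	size_t hdr = sizeof(uint32_t) + 3 + namelen;	/* header + name */

	return (ROUNDUP8(hdr) + ROUNDUP8(contentlen));
}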
*/ static int ffs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; struct extattr *eap; uint32_t ealength, ul; ssize_t ealen; int olen, eapad1, eapad2, error, i, easize; u_char *eae; void *tmp; ip = VTOI(ap->a_vp); fs = ITOFS(ip); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); /* XXX Now unsupported API to delete EAs using NULL uio. */ if (ap->a_uio == NULL) return (EOPNOTSUPP); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); ealen = ap->a_uio->uio_resid; - if (ealen < 0 || ealen > lblktosize(fs, NXADDR)) + if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR)) return (EINVAL); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) { /* * ffs_lock_ea is not needed there, because the vnode * must be exclusively locked. */ if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); eapad1 = roundup2(ealength, 8) - ealength; eapad2 = roundup2(ealen, 8) - ealen; ealength += eapad1 + ealen + eapad2; /* * CEM: rewrites of the same size or smaller could be done in-place * instead. (We don't acquire any fine-grained locks in here either, * so we could also do bigger writes in-place.) */ eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &eap, NULL); if (olen == -1) { /* new, append at end */ KASSERT(ALIGNED_TO(eae + easize, struct extattr), ("unaligned")); eap = (struct extattr *)(eae + easize); easize += ealength; } else { ul = eap->ea_length; i = (u_char *)EXTATTR_NEXT(eap) - eae; if (ul != ealength) { bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength, easize - i); easize += (ealength - ul); } } - if (easize > lblktosize(fs, NXADDR)) { + if (easize > lblktosize(fs, UFS_NXADDR)) { free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return (ENOSPC); } eap->ea_length = ealength; eap->ea_namespace = ap->a_attrnamespace; eap->ea_contentpadlen = eapad2; eap->ea_namelength = strlen(ap->a_name); memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name)); bzero(&eap->ea_name[strlen(ap->a_name)], eapad1); error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio); if (error) { free(eae, M_TEMP); ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2); tmp = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(tmp, M_TEMP); error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return (error); } /* * Vnode pointer to File handle */ static int ffs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } SYSCTL_DECL(_vfs_ffs); static int use_buf_pager = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, 
CTLFLAG_RWTUN, &use_buf_pager, 0, "Always use buffer pager instead of bmap"); static daddr_t ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) { return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); } static int ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) { return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); } static int ffs_getpages(struct vop_getpages_args *ap) { struct vnode *vp; struct ufsmount *um; vp = ap->a_vp; um = VFSTOUFS(vp->v_mount); if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); } Index: head/sys/ufs/ffs/fs.h =================================================================== --- head/sys/ufs/ffs/fs.h (revision 313779) +++ head/sys/ufs/ffs/fs.h (revision 313780) @@ -1,778 +1,778 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fs.h 8.13 (Berkeley) 3/21/95 * $FreeBSD$ */ #ifndef _UFS_FFS_FS_H_ #define _UFS_FFS_FS_H_ #include #include /* * Each disk drive contains some number of filesystems. * A filesystem consists of a number of cylinder groups. * Each cylinder group has inodes and data. * * A filesystem is described by its super-block, which in turn * describes the cylinder groups. The super-block is critical * data and is replicated in each cylinder group to protect against * catastrophic loss. This is done at `newfs' time and the critical * super-block data does not change, so the copies need not be * referenced further unless disaster strikes. * * For filesystem fs, the offsets of the various blocks of interest * are given in the super block as: * [fs->fs_sblkno] Super-block * [fs->fs_cblkno] Cylinder group block * [fs->fs_iblkno] Inode blocks * [fs->fs_dblkno] Data blocks * The beginning of cylinder group cg in fs, is given by * the ``cgbase(fs, cg)'' macro. 
* * Depending on the architecture and the media, the superblock may * reside in any one of four places. For tiny media where every block * counts, it is placed at the very front of the partition. Historically, * UFS1 placed it 8K from the front to leave room for the disk label and * a small bootstrap. For UFS2 it got moved to 64K from the front to leave * room for the disk label and a bigger bootstrap, and for really piggy * systems we check at 256K from the front if the first three fail. In * all cases the size of the superblock will be SBLOCKSIZE. All values are * given in byte-offset form, so they do not imply a sector size. The * SBLOCKSEARCH specifies the order in which the locations should be searched. */ #define SBLOCK_FLOPPY 0 #define SBLOCK_UFS1 8192 #define SBLOCK_UFS2 65536 #define SBLOCK_PIGGY 262144 #define SBLOCKSIZE 8192 #define SBLOCKSEARCH \ { SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 } /* * Max number of fragments per block. This value is NOT tweakable. */ #define MAXFRAG 8 /* * Addresses stored in inodes are capable of addressing fragments * of `blocks'. File system blocks of at most size MAXBSIZE can * be optionally broken into 2, 4, or 8 pieces, each of which is * addressable; these pieces may be DEV_BSIZE, or some multiple of * a DEV_BSIZE unit. * * Large files consist of exclusively large data blocks. To avoid * undue wasted disk space, the last data block of a small file may be * allocated as only as many fragments of a large block as are * necessary. The filesystem format retains only a single pointer * to such a fragment, which is a piece of a single large block that * has been divided. The size of such a fragment is determinable from * information in the inode, using the ``blksize(fs, ip, lbn)'' macro. * * The filesystem records space availability at the fragment level; * to determine block availability, aligned fragments are examined. */ /* * MINBSIZE is the smallest allowable block size. * In order to insure that it is possible to create files of size * 2^32 with only two levels of indirection, MINBSIZE is set to 4096. * MINBSIZE must be big enough to hold a cylinder group block, * thus changes to (struct cg) must keep its size within MINBSIZE. * Note that super blocks are always of size SBLOCKSIZE, * and that both SBLOCKSIZE and MAXBSIZE must be >= MINBSIZE. */ #define MINBSIZE 4096 /* * The path name on which the filesystem is mounted is maintained * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in * the super block for this name. */ #define MAXMNTLEN 468 /* * The volume name for this filesystem is maintained in fs_volname. * MAXVOLLEN defines the length of the buffer allocated. */ #define MAXVOLLEN 32 /* * There is a 128-byte region in the superblock reserved for in-core * pointers to summary information. Originally this included an array * of pointers to blocks of struct csum; now there are just a few * pointers and the remaining space is padded with fs_ocsp[]. * * NOCSPTRS determines the size of this padding. One pointer (fs_csp) * is taken away to point to a contiguous array of struct csum for * all cylinder groups; a second (fs_maxcluster) points to an array * of cluster sizes that is computed as cylinder groups are inspected, * and the third points to an array that tracks the creation of new * directories. A fourth pointer, fs_active, is used when creating * snapshots; it points to a bitmap of cylinder groups for which the * free-block bitmap has changed since the snapshot operation began. 
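/*
 * A sketch of how a reader walks SBLOCKSEARCH (the hypothetical
 * read_at() supplies SBLOCKSIZE bytes from a given byte offset; struct
 * fs and the magic numbers are defined further down in this header).
 * Simplified: a real probe also cross-checks fs_sblockloc so that a
 * backup superblock is not mistaken for the primary one.
 */
static long
find_superblock(int (*read_at)(long off, struct fs *fs), struct fs *fs)
{
	static const long try[] = SBLOCKSEARCH;
	int i;

	for (i = 0; try[i] != -1; i++) {
		if (read_at(try[i], fs) == 0 &&
		    (fs->fs_magic == FS_UFS1_MAGIC ||
		     fs->fs_magic == FS_UFS2_MAGIC))
			return (try[i]);	/* byte offset of the sb */
	}
	return (-1);
}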
*/ #define NOCSPTRS ((128 / sizeof(void *)) - 4) /* * A summary of contiguous blocks of various sizes is maintained * in each cylinder group. Normally this is set by the initial * value of fs_maxcontig. To conserve space, a maximum summary size * is set by FS_MAXCONTIG. */ #define FS_MAXCONTIG 16 /* * MINFREE gives the minimum acceptable percentage of filesystem * blocks which may be free. If the freelist drops below this level * only the superuser may continue to allocate blocks. This may * be set to 0 if no reserve of free blocks is deemed necessary, * however throughput drops by fifty percent if the filesystem * is run at between 95% and 100% full; thus the minimum default * value of fs_minfree is 5%. However, to get good clustering * performance, 10% is a better choice. hence we use 10% as our * default value. With 10% free space, fragmentation is not a * problem, so we choose to optimize for time. */ #define MINFREE 8 #define DEFAULTOPT FS_OPTTIME /* * Grigoriy Orlov has done some extensive work to fine * tune the layout preferences for directories within a filesystem. * His algorithm can be tuned by adjusting the following parameters * which tell the system the average file size and the average number * of files per directory. These defaults are well selected for typical * filesystems, but may need to be tuned for odd cases like filesystems * being used for squid caches or news spools. */ #define AVFILESIZ 16384 /* expected average file size */ #define AFPDIR 64 /* expected number of files per directory */ /* * The maximum number of snapshot nodes that can be associated * with each filesystem. This limit affects only the number of * snapshot files that can be recorded within the superblock so * that they can be found when the filesystem is mounted. However, * maintaining too many will slow the filesystem performance, so * having this limit is a good idea. */ #define FSMAXSNAP 20 /* * Used to identify special blocks in snapshots: * * BLK_NOCOPY - A block that was unallocated at the time the snapshot * was taken, hence does not need to be copied when written. * BLK_SNAP - A block held by another snapshot that is not needed by this * snapshot. When the other snapshot is freed, the BLK_SNAP entries * are converted to BLK_NOCOPY. These are needed to allow fsck to * identify blocks that are in use by other snapshots (which are * expunged from this snapshot). */ #define BLK_NOCOPY ((ufs2_daddr_t)(1)) #define BLK_SNAP ((ufs2_daddr_t)(2)) /* * Sysctl values for the fast filesystem. */ #define FFS_ADJ_REFCNT 1 /* adjust inode reference count */ #define FFS_ADJ_BLKCNT 2 /* adjust inode used block count */ #define FFS_BLK_FREE 3 /* free range of blocks in map */ #define FFS_DIR_FREE 4 /* free specified dir inodes in map */ #define FFS_FILE_FREE 5 /* free specified file inodes in map */ #define FFS_SET_FLAGS 6 /* set filesystem flags */ #define FFS_ADJ_NDIR 7 /* adjust number of directories */ #define FFS_ADJ_NBFREE 8 /* adjust number of free blocks */ #define FFS_ADJ_NIFREE 9 /* adjust number of free inodes */ #define FFS_ADJ_NFFREE 10 /* adjust number of free frags */ #define FFS_ADJ_NUMCLUSTERS 11 /* adjust number of free clusters */ #define FFS_SET_CWD 12 /* set current directory */ #define FFS_SET_DOTDOT 13 /* set inode number for ".." 
*/ #define FFS_UNLINK 14 /* remove a name in the filesystem */ #define FFS_SET_INODE 15 /* update an on-disk inode */ #define FFS_SET_BUFOUTPUT 16 /* set buffered writing on descriptor */ #define FFS_MAXID 16 /* number of valid ffs ids */ /* * Command structure passed in to the filesystem to adjust filesystem values. */ #define FFS_CMD_VERSION 0x19790518 /* version ID */ struct fsck_cmd { int32_t version; /* version of command structure */ int32_t handle; /* reference to filesystem to be changed */ int64_t value; /* inode or block number to be affected */ int64_t size; /* amount or range to be adjusted */ int64_t spare; /* reserved for future use */ }; /* * Per cylinder group information; summarized in blocks allocated * from first cylinder group data blocks. These blocks have to be * read in from fs_csaddr (size fs_cssize) in addition to the * super block. */ struct csum { int32_t cs_ndir; /* number of directories */ int32_t cs_nbfree; /* number of free blocks */ int32_t cs_nifree; /* number of free inodes */ int32_t cs_nffree; /* number of free frags */ }; struct csum_total { int64_t cs_ndir; /* number of directories */ int64_t cs_nbfree; /* number of free blocks */ int64_t cs_nifree; /* number of free inodes */ int64_t cs_nffree; /* number of free frags */ int64_t cs_numclusters; /* number of free clusters */ int64_t cs_spare[3]; /* future expansion */ }; /* * Super block for an FFS filesystem. */ struct fs { int32_t fs_firstfield; /* historic filesystem linked list, */ int32_t fs_unused_1; /* used for incore super blocks */ int32_t fs_sblkno; /* offset of super-block in filesys */ int32_t fs_cblkno; /* offset of cyl-block in filesys */ int32_t fs_iblkno; /* offset of inode-blocks in filesys */ int32_t fs_dblkno; /* offset of first data after cg */ int32_t fs_old_cgoffset; /* cylinder group offset in cylinder */ int32_t fs_old_cgmask; /* used to calc mod fs_ntrak */ int32_t fs_old_time; /* last time written */ int32_t fs_old_size; /* number of blocks in fs */ int32_t fs_old_dsize; /* number of data blocks in fs */ u_int32_t fs_ncg; /* number of cylinder groups */ int32_t fs_bsize; /* size of basic blocks in fs */ int32_t fs_fsize; /* size of frag blocks in fs */ int32_t fs_frag; /* number of frags in a block in fs */ /* these are configuration parameters */ int32_t fs_minfree; /* minimum percentage of free blocks */ int32_t fs_old_rotdelay; /* num of ms for optimal next block */ int32_t fs_old_rps; /* disk revolutions per second */ /* these fields can be computed from the others */ int32_t fs_bmask; /* ``blkoff'' calc of blk offsets */ int32_t fs_fmask; /* ``fragoff'' calc of frag offsets */ int32_t fs_bshift; /* ``lblkno'' calc of logical blkno */ int32_t fs_fshift; /* ``numfrags'' calc number of frags */ /* these are configuration parameters */ int32_t fs_maxcontig; /* max number of contiguous blks */ int32_t fs_maxbpg; /* max number of blks per cyl group */ /* these fields can be computed from the others */ int32_t fs_fragshift; /* block to frag shift */ int32_t fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ int32_t fs_sbsize; /* actual size of super block */ int32_t fs_spare1[2]; /* old fs_csmask */ /* old fs_csshift */ int32_t fs_nindir; /* value of NINDIR */ u_int32_t fs_inopb; /* value of INOPB */ int32_t fs_old_nspf; /* value of NSPF */ /* yet another configuration parameter */ int32_t fs_optim; /* optimization preference, see below */ int32_t fs_old_npsect; /* # sectors/track including spares */ int32_t fs_old_interleave; /* hardware sector interleave */ int32_t 
fs_old_trackskew; /* sector 0 skew, per track */ int32_t fs_id[2]; /* unique filesystem id */ /* sizes determined by number of cylinder groups and their sizes */ int32_t fs_old_csaddr; /* blk addr of cyl grp summary area */ int32_t fs_cssize; /* size of cyl grp summary area */ int32_t fs_cgsize; /* cylinder group size */ int32_t fs_spare2; /* old fs_ntrak */ int32_t fs_old_nsect; /* sectors per track */ int32_t fs_old_spc; /* sectors per cylinder */ int32_t fs_old_ncyl; /* cylinders in filesystem */ int32_t fs_old_cpg; /* cylinders per group */ u_int32_t fs_ipg; /* inodes per group */ int32_t fs_fpg; /* blocks per group * fs_frag */ /* this data must be re-computed after crashes */ struct csum fs_old_cstotal; /* cylinder summary information */ /* these fields are cleared at mount time */ int8_t fs_fmod; /* super block modified flag */ int8_t fs_clean; /* filesystem is clean flag */ int8_t fs_ronly; /* mounted read-only flag */ int8_t fs_old_flags; /* old FS_ flags */ u_char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ u_char fs_volname[MAXVOLLEN]; /* volume name */ u_int64_t fs_swuid; /* system-wide uid */ int32_t fs_pad; /* due to alignment of fs_swuid */ /* these fields retain the current block allocation info */ int32_t fs_cgrotor; /* last cg searched */ void *fs_ocsp[NOCSPTRS]; /* padding; was list of fs_cs buffers */ u_int8_t *fs_contigdirs; /* (u) # of contig. allocated dirs */ struct csum *fs_csp; /* (u) cg summary info buffer */ int32_t *fs_maxcluster; /* (u) max cluster in each cyl group */ u_int *fs_active; /* (u) used by snapshots to track fs */ int32_t fs_old_cpc; /* cyl per cycle in postbl */ int32_t fs_maxbsize; /* maximum blocking factor permitted */ int64_t fs_unrefs; /* number of unreferenced inodes */ int64_t fs_providersize; /* size of underlying GEOM provider */ int64_t fs_metaspace; /* size of area reserved for metadata */ int64_t fs_sparecon64[14]; /* old rotation block list head */ int64_t fs_sblockloc; /* byte offset of standard superblock */ struct csum_total fs_cstotal; /* (u) cylinder summary information */ ufs_time_t fs_time; /* last time written */ int64_t fs_size; /* number of blocks in fs */ int64_t fs_dsize; /* number of data blocks in fs */ ufs2_daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ int64_t fs_pendingblocks; /* (u) blocks being freed */ u_int32_t fs_pendinginodes; /* (u) inodes being freed */ uint32_t fs_snapinum[FSMAXSNAP];/* list of snapshot inode numbers */ u_int32_t fs_avgfilesize; /* expected average file size */ u_int32_t fs_avgfpdir; /* expected # of files per directory */ int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */ ufs_time_t fs_mtime; /* Last mount or fsck time. */ int32_t fs_sujfree; /* SUJ free list */ int32_t fs_sparecon32[23]; /* reserved for future constants */ int32_t fs_flags; /* see FS_ flags below */ int32_t fs_contigsumsize; /* size of cluster summary array */ int32_t fs_maxsymlinklen; /* max length of an internal symlink */ int32_t fs_old_inodefmt; /* format of on-disk inodes */ u_int64_t fs_maxfilesize; /* maximum representable file size */ int64_t fs_qbmask; /* ~fs_bmask for use with 64-bit size */ int64_t fs_qfmask; /* ~fs_fmask for use with 64-bit size */ int32_t fs_state; /* validate fs_clean field */ int32_t fs_old_postblformat; /* format of positional layout tables */ int32_t fs_old_nrpos; /* number of rotational positions */ int32_t fs_spare5[2]; /* old fs_postbloff */ /* old fs_rotbloff */ int32_t fs_magic; /* magic number */ }; /* Sanity checking. 
*/ #ifdef CTASSERT CTASSERT(sizeof(struct fs) == 1376); #endif /* * Filesystem identification */ #define FS_UFS1_MAGIC 0x011954 /* UFS1 fast filesystem magic number */ #define FS_UFS2_MAGIC 0x19540119 /* UFS2 fast filesystem magic number */ #define FS_BAD_MAGIC 0x19960408 /* UFS incomplete newfs magic number */ #define FS_OKAY 0x7c269d38 /* superblock checksum */ #define FS_42INODEFMT -1 /* 4.2BSD inode format */ #define FS_44INODEFMT 2 /* 4.4BSD inode format */ /* * Preference for optimization. */ #define FS_OPTTIME 0 /* minimize allocation time */ #define FS_OPTSPACE 1 /* minimize disk fragmentation */ /* * Filesystem flags. * * The FS_UNCLEAN flag is set by the kernel when the filesystem was * mounted with fs_clean set to zero. The FS_DOSOFTDEP flag indicates * that the filesystem should be managed by the soft updates code. * Note that the FS_NEEDSFSCK flag is set and cleared only by the * fsck utility. It is set when background fsck finds an unexpected * inconsistency which requires a traditional foreground fsck to be * run. Such inconsistencies should only be found after an uncorrectable * disk error. A foreground fsck will clear the FS_NEEDSFSCK flag when * it has successfully cleaned up the filesystem. The kernel uses this * flag to enforce that inconsistent filesystems be mounted read-only. * The FS_INDEXDIRS flag when set indicates that the kernel maintains * on-disk auxiliary indexes (such as B-trees) for speeding directory * accesses. Kernels that do not support auxiliary indices clear the * flag to indicate that the indices need to be rebuilt (by fsck) before * they can be used. * * FS_ACLS indicates that POSIX.1e ACLs are administratively enabled * for the file system, so they should be loaded from extended attributes, * observed for access control purposes, and be administered by object * owners. FS_NFS4ACLS indicates that NFSv4 ACLs are administratively * enabled. This flag is mutually exclusive with FS_ACLS. FS_MULTILABEL * indicates that the TrustedBSD MAC Framework should attempt to back MAC * labels into extended attributes on the file system rather than maintain * a single mount label for all objects. */ #define FS_UNCLEAN 0x0001 /* filesystem not clean at mount */ #define FS_DOSOFTDEP 0x0002 /* filesystem using soft dependencies */ #define FS_NEEDSFSCK 0x0004 /* filesystem needs sync fsck before mount */ #define FS_SUJ 0x0008 /* Filesystem using softupdate journal */ #define FS_ACLS 0x0010 /* file system has POSIX.1e ACLs enabled */ #define FS_MULTILABEL 0x0020 /* file system is MAC multi-label */ #define FS_GJOURNAL 0x0040 /* gjournaled file system */ #define FS_FLAGS_UPDATED 0x0080 /* flags have been moved to new location */ #define FS_NFS4ACLS 0x0100 /* file system has NFSv4 ACLs enabled */ #define FS_INDEXDIRS 0x0200 /* kernel supports indexed directories */ #define FS_TRIM 0x0400 /* issue BIO_DELETE for deleted blocks */ /* * Macros to access bits in the fs_active array. */ #define ACTIVECGNUM(fs, cg) ((fs)->fs_active[(cg) / (NBBY * sizeof(int))]) #define ACTIVECGOFF(cg) (1 << ((cg) % (NBBY * sizeof(int)))) #define ACTIVESET(fs, cg) do { \ if ((fs)->fs_active) \ ACTIVECGNUM((fs), (cg)) |= ACTIVECGOFF((cg)); \ } while (0) #define ACTIVECLEAR(fs, cg) do { \ if ((fs)->fs_active) \ ACTIVECGNUM((fs), (cg)) &= ~ACTIVECGOFF((cg)); \ } while (0) /* * The size of a cylinder group is calculated by CGSIZE. The maximum size * is limited by the fact that cylinder groups are at most one block. 
* Its size is derived from the size of the maps maintained in the * cylinder group and the (struct cg) size. */ #define CGSIZE(fs) \ /* base cg */ (sizeof(struct cg) + sizeof(int32_t) + \ /* old btotoff */ (fs)->fs_old_cpg * sizeof(int32_t) + \ /* old boff */ (fs)->fs_old_cpg * sizeof(u_int16_t) + \ /* inode map */ howmany((fs)->fs_ipg, NBBY) + \ /* block map */ howmany((fs)->fs_fpg, NBBY) +\ /* if present */ ((fs)->fs_contigsumsize <= 0 ? 0 : \ /* cluster sum */ (fs)->fs_contigsumsize * sizeof(int32_t) + \ /* cluster map */ howmany(fragstoblks(fs, (fs)->fs_fpg), NBBY))) /* * The minimal number of cylinder groups that should be created. */ #define MINCYLGRPS 4 /* * Convert cylinder group to base address of its global summary info. */ #define fs_cs(fs, indx) fs_csp[indx] /* * Cylinder group block for a filesystem. */ #define CG_MAGIC 0x090255 struct cg { int32_t cg_firstfield; /* historic cyl groups linked list */ int32_t cg_magic; /* magic number */ int32_t cg_old_time; /* time last written */ u_int32_t cg_cgx; /* we are the cgx'th cylinder group */ int16_t cg_old_ncyl; /* number of cyl's this cg */ int16_t cg_old_niblk; /* number of inode blocks this cg */ u_int32_t cg_ndblk; /* number of data blocks this cg */ struct csum cg_cs; /* cylinder summary information */ u_int32_t cg_rotor; /* position of last used block */ u_int32_t cg_frotor; /* position of last used frag */ u_int32_t cg_irotor; /* position of last used inode */ u_int32_t cg_frsum[MAXFRAG]; /* counts of available frags */ int32_t cg_old_btotoff; /* (int32) block totals per cylinder */ int32_t cg_old_boff; /* (u_int16) free block positions */ u_int32_t cg_iusedoff; /* (u_int8) used inode map */ u_int32_t cg_freeoff; /* (u_int8) free block map */ u_int32_t cg_nextfreeoff; /* (u_int8) next available space */ u_int32_t cg_clustersumoff; /* (u_int32) counts of avail clusters */ u_int32_t cg_clusteroff; /* (u_int8) free cluster map */ u_int32_t cg_nclusterblks; /* number of clusters this cg */ u_int32_t cg_niblk; /* number of inode blocks this cg */ u_int32_t cg_initediblk; /* last initialized inode */ u_int32_t cg_unrefs; /* number of unreferenced inodes */ int32_t cg_sparecon32[2]; /* reserved for future use */ ufs_time_t cg_time; /* time last written */ int64_t cg_sparecon64[3]; /* reserved for future use */ u_int8_t cg_space[1]; /* space for cylinder group maps */ /* actually longer */ }; /* * Macros for access to cylinder group array structures */ #define cg_chkmagic(cgp) ((cgp)->cg_magic == CG_MAGIC) #define cg_inosused(cgp) \ ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_iusedoff)) #define cg_blksfree(cgp) \ ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_freeoff)) #define cg_clustersfree(cgp) \ ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_clusteroff)) #define cg_clustersum(cgp) \ ((int32_t *)((uintptr_t)(cgp) + (cgp)->cg_clustersumoff)) /* * Turn filesystem block numbers into disk block addresses. * This maps filesystem blocks to device size blocks. */ #define fsbtodb(fs, b) ((daddr_t)(b) << (fs)->fs_fsbtodb) #define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb) /* * Cylinder group macros to locate things in cylinder groups. * They calc filesystem addresses of cylinder group data structures. 
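/*
 * Worked example for fsbtodb()/dbtofsb() above, with illustrative
 * values: for 4K fragments and 512-byte device sectors, fs_fsbtodb is
 * 3 (one fragment spans 8 sectors), so filesystem address 10 maps to
 * disk address 80 and back.
 */
#include <assert.h>

static void
fsbtodb_demo(void)
{
	int shift = 3;			/* log2(4096 / 512) */

	assert((10L << shift) == 80);	/* fsbtodb */
	assert((80L >> shift) == 10);	/* dbtofsb */
}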
*/ #define cgbase(fs, c) (((ufs2_daddr_t)(fs)->fs_fpg) * (c)) #define cgdata(fs, c) (cgdmin(fs, c) + (fs)->fs_metaspace) /* data zone */ #define cgmeta(fs, c) (cgdmin(fs, c)) /* meta data */ #define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */ #define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk */ #define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk */ #define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */ #define cgstart(fs, c) \ ((fs)->fs_magic == FS_UFS2_MAGIC ? cgbase(fs, c) : \ (cgbase(fs, c) + (fs)->fs_old_cgoffset * ((c) & ~((fs)->fs_old_cgmask)))) /* * Macros for handling inode numbers: * inode number to filesystem block offset. * inode number to cylinder group number. * inode number to filesystem block address. */ #define ino_to_cg(fs, x) (((ino_t)(x)) / (fs)->fs_ipg) #define ino_to_fsba(fs, x) \ ((ufs2_daddr_t)(cgimin(fs, ino_to_cg(fs, (ino_t)(x))) + \ (blkstofrags((fs), ((((ino_t)(x)) % (fs)->fs_ipg) / INOPB(fs)))))) #define ino_to_fsbo(fs, x) (((ino_t)(x)) % INOPB(fs)) /* * Give cylinder group number for a filesystem block. * Give cylinder group block number for a filesystem block. */ #define dtog(fs, d) ((d) / (fs)->fs_fpg) #define dtogd(fs, d) ((d) % (fs)->fs_fpg) /* * Extract the bits for a block from a map. * Compute the cylinder and rotational position of a cyl block addr. */ #define blkmap(fs, map, loc) \ (((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag))) /* * The following macros optimize certain frequently calculated * quantities by using shifts and masks in place of divisions * modulos and multiplications. */ #define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \ ((loc) & (fs)->fs_qbmask) #define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \ ((loc) & (fs)->fs_qfmask) #define lfragtosize(fs, frag) /* calculates ((off_t)frag * fs->fs_fsize) */ \ (((off_t)(frag)) << (fs)->fs_fshift) #define lblktosize(fs, blk) /* calculates ((off_t)blk * fs->fs_bsize) */ \ (((off_t)(blk)) << (fs)->fs_bshift) -/* Use this only when `blk' is known to be small, e.g., < NDADDR. */ +/* Use this only when `blk' is known to be small, e.g., < UFS_NDADDR. */ #define smalllblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \ ((blk) << (fs)->fs_bshift) #define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \ ((loc) >> (fs)->fs_bshift) #define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ ((loc) >> (fs)->fs_fshift) #define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \ (((size) + (fs)->fs_qbmask) & (fs)->fs_bmask) #define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \ (((size) + (fs)->fs_qfmask) & (fs)->fs_fmask) #define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \ ((frags) >> (fs)->fs_fragshift) #define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \ ((blks) << (fs)->fs_fragshift) #define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \ ((fsb) & ((fs)->fs_frag - 1)) #define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \ ((fsb) &~ ((fs)->fs_frag - 1)) /* * Determine the number of available frags given a * percentage to hold in reserve. */ #define freespace(fs, percentreserved) \ (blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \ (fs)->fs_cstotal.cs_nffree - \ (((off_t)((fs)->fs_dsize)) * (percentreserved) / 100)) /* * Determining the size of a file block in the filesystem. 
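/*
 * The shift/mask identities above, spelled out for a hypothetical
 * 8K-block filesystem (fs_bshift 13, fs_qbmask 0x1fff, fs_bmask the
 * complement); purely illustrative values.
 */
#include <assert.h>

static void
shift_mask_demo(void)
{
	long loc = 20000;

	assert((loc & 0x1fffL) == loc % 8192);	/* blkoff */
	assert((loc >> 13) == loc / 8192);	/* lblkno */
	/* blkroundup == roundup(loc, 8192) */
	assert(((loc + 0x1fffL) & ~0x1fffL) ==
	    8192 * ((loc + 8191) / 8192));
}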
*/ #define blksize(fs, ip, lbn) \ - (((lbn) >= NDADDR || (ip)->i_size >= smalllblktosize(fs, (lbn) + 1)) \ + (((lbn) >= UFS_NDADDR || (ip)->i_size >= smalllblktosize(fs, (lbn) + 1)) \ ? (fs)->fs_bsize \ : (fragroundup(fs, blkoff(fs, (ip)->i_size)))) #define sblksize(fs, size, lbn) \ - (((lbn) >= NDADDR || (size) >= ((lbn) + 1) << (fs)->fs_bshift) \ + (((lbn) >= UFS_NDADDR || (size) >= ((lbn) + 1) << (fs)->fs_bshift) \ ? (fs)->fs_bsize \ : (fragroundup(fs, blkoff(fs, (size))))) /* * Number of indirects in a filesystem block. */ #define NINDIR(fs) ((fs)->fs_nindir) /* - * Indirect lbns are aligned on NDADDR addresses where single indirects + * Indirect lbns are aligned on UFS_NDADDR addresses where single indirects * are the negated address of the lowest lbn reachable, double indirects * are this lbn - 1 and triple indirects are this lbn - 2. This yields * an unusual bit order to determine level. */ static inline int lbn_level(ufs_lbn_t lbn) { if (lbn >= 0) return 0; switch (lbn & 0x3) { case 0: return (0); case 1: break; case 2: return (2); case 3: return (1); default: break; } return (-1); } static inline ufs_lbn_t lbn_offset(struct fs *fs, int level) { ufs_lbn_t res; for (res = 1; level > 0; level--) res *= NINDIR(fs); return (res); } /* * Number of inodes in a secondary storage block/fragment. */ #define INOPB(fs) ((fs)->fs_inopb) #define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift) /* * Softdep journal record format. */ #define JOP_ADDREF 1 /* Add a reference to an inode. */ #define JOP_REMREF 2 /* Remove a reference from an inode. */ #define JOP_NEWBLK 3 /* Allocate a block. */ #define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */ #define JOP_MVREF 5 /* Move a reference from one off to another. */ #define JOP_TRUNC 6 /* Partial truncation record. */ #define JOP_SYNC 7 /* fsync() complete record. */ #define JREC_SIZE 32 /* Record and segment header size. */ #define SUJ_MIN (4 * 1024 * 1024) /* Minimum journal size */ #define SUJ_MAX (32 * 1024 * 1024) /* Maximum journal size */ #define SUJ_FILE ".sujournal" /* Journal file name */ /* * Size of the segment record header. There is at most one for each disk * block in the journal. The segment header is followed by an array of * records. fsck depends on the first element in each record being 'op' * and the second being 'ino'. Segments may span multiple disk blocks but * the header is present on each. */ struct jsegrec { uint64_t jsr_seq; /* Our sequence number */ uint64_t jsr_oldest; /* Oldest valid sequence number */ uint16_t jsr_cnt; /* Count of valid records */ uint16_t jsr_blocks; /* Count of device bsize blocks. */ uint32_t jsr_crc; /* 32bit crc of the valid space */ ufs_time_t jsr_time; /* timestamp for mount instance */ }; /* * Reference record. Records a single link count modification. */ struct jrefrec { uint32_t jr_op; uint32_t jr_ino; uint32_t jr_parent; uint16_t jr_nlink; uint16_t jr_mode; int64_t jr_diroff; uint64_t jr_unused; }; /* * Move record. Records a reference moving within a directory block. The * nlink is unchanged but we must search both locations. */ struct jmvrec { uint32_t jm_op; uint32_t jm_ino; uint32_t jm_parent; uint16_t jm_unused; int64_t jm_oldoff; int64_t jm_newoff; }; /* * Block record. A set of frags or tree of blocks starting at an indirect are * freed or a set of frags are allocated. */ struct jblkrec { uint32_t jb_op; uint32_t jb_ino; ufs2_daddr_t jb_blkno; ufs_lbn_t jb_lbn; uint16_t jb_frags; uint16_t jb_oldfrags; uint32_t jb_unused; }; /* * Truncation record. 
Records a partial truncation so that it may be * completed at check time. Also used for sync records. */ struct jtrncrec { uint32_t jt_op; uint32_t jt_ino; int64_t jt_size; uint32_t jt_extsize; uint32_t jt_pad[3]; }; union jrec { struct jsegrec rec_jsegrec; struct jrefrec rec_jrefrec; struct jmvrec rec_jmvrec; struct jblkrec rec_jblkrec; struct jtrncrec rec_jtrncrec; }; #ifdef CTASSERT CTASSERT(sizeof(struct jsegrec) == JREC_SIZE); CTASSERT(sizeof(struct jrefrec) == JREC_SIZE); CTASSERT(sizeof(struct jmvrec) == JREC_SIZE); CTASSERT(sizeof(struct jblkrec) == JREC_SIZE); CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE); CTASSERT(sizeof(union jrec) == JREC_SIZE); #endif extern int inside[], around[]; extern u_char *fragtbl[]; /* * IOCTLs used for filesystem write suspension. */ #define UFSSUSPEND _IOW('U', 1, fsid_t) #define UFSRESUME _IO('U', 2) #endif Index: head/sys/ufs/ufs/dinode.h =================================================================== --- head/sys/ufs/ufs/dinode.h (revision 313779) +++ head/sys/ufs/ufs/dinode.h (revision 313780) @@ -1,189 +1,189 @@ /*- * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)dinode.h 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #ifndef _UFS_UFS_DINODE_H_ #define _UFS_UFS_DINODE_H_ /* * The root inode is the root of the filesystem. Inode 0 can't be used for * normal purposes and historically bad blocks were linked to inode 1, thus * the root inode is 2. (Inode 1 is no longer used for this purpose, however * numerous dump tapes make this assumption, so we are stuck with it). */ -#define ROOTINO ((ino_t)2) +#define UFS_ROOTINO ((ino_t)2) /* * The Whiteout inode# is a dummy non-zero inode number which will * never be allocated to a real file. It is used as a place holder * in the directory entry which has been tagged as a DT_WHT entry. - * See the comments about ROOTINO above. + * See the comments about UFS_ROOTINO above. */ -#define WINO ((ino_t)1) +#define UFS_WINO ((ino_t)1) /* * The size of physical and logical block numbers and time fields in UFS. */ typedef int32_t ufs1_daddr_t; typedef int64_t ufs2_daddr_t; typedef int64_t ufs_lbn_t; typedef int64_t ufs_time_t; /* File permissions. */ #define IEXEC 0000100 /* Executable. */ #define IWRITE 0000200 /* Writeable. */ #define IREAD 0000400 /* Readable. */ #define ISVTX 0001000 /* Sticky bit. */ #define ISGID 0002000 /* Set-gid. */ #define ISUID 0004000 /* Set-uid. */ /* File types. */ #define IFMT 0170000 /* Mask of file type. */ #define IFIFO 0010000 /* Named pipe (fifo). */ #define IFCHR 0020000 /* Character device. */ #define IFDIR 0040000 /* Directory file. */ #define IFBLK 0060000 /* Block device. */ #define IFREG 0100000 /* Regular file. */ #define IFLNK 0120000 /* Symbolic link. */ #define IFSOCK 0140000 /* UNIX domain socket. */ #define IFWHT 0160000 /* Whiteout. */ /* * A dinode contains all the meta-data associated with a UFS2 file. * This structure defines the on-disk format of a dinode. Since * this structure describes an on-disk structure, all its fields * are defined by types with precise widths. */ -#define NXADDR 2 /* External addresses in inode. */ -#define NDADDR 12 /* Direct addresses in inode. */ -#define NIADDR 3 /* Indirect addresses in inode. */ +#define UFS_NXADDR 2 /* External addresses in inode. */ +#define UFS_NDADDR 12 /* Direct addresses in inode. */ +#define UFS_NIADDR 3 /* Indirect addresses in inode. */ struct ufs2_dinode { u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ int16_t di_nlink; /* 2: File link count. */ u_int32_t di_uid; /* 4: File owner. 
*/ u_int32_t di_gid; /* 8: File group. */ u_int32_t di_blksize; /* 12: Inode blocksize. */ u_int64_t di_size; /* 16: File byte count. */ u_int64_t di_blocks; /* 24: Blocks actually held. */ ufs_time_t di_atime; /* 32: Last access time. */ ufs_time_t di_mtime; /* 40: Last modified time. */ ufs_time_t di_ctime; /* 48: Last inode change time. */ ufs_time_t di_birthtime; /* 56: Inode creation time. */ int32_t di_mtimensec; /* 64: Last modified time. */ int32_t di_atimensec; /* 68: Last access time. */ int32_t di_ctimensec; /* 72: Last inode change time. */ int32_t di_birthnsec; /* 76: Inode creation time. */ u_int32_t di_gen; /* 80: Generation number. */ u_int32_t di_kernflags; /* 84: Kernel flags. */ u_int32_t di_flags; /* 88: Status flags (chflags). */ u_int32_t di_extsize; /* 92: External attributes size. */ - ufs2_daddr_t di_extb[NXADDR];/* 96: External attributes block. */ - ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */ - ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */ + ufs2_daddr_t di_extb[UFS_NXADDR];/* 96: External attributes block. */ + ufs2_daddr_t di_db[UFS_NDADDR]; /* 112: Direct disk blocks. */ + ufs2_daddr_t di_ib[UFS_NIADDR]; /* 208: Indirect disk blocks. */ u_int64_t di_modrev; /* 232: i_modrev for NFSv4 */ uint32_t di_freelink; /* 240: SUJ: Next unlinked inode. */ uint32_t di_spare[3]; /* 244: Reserved; currently unused */ }; /* * The di_db fields may be overlaid with other information for * file types that do not have associated disk storage. Block * and character devices overlay the first data block with their * dev_t value. Short symbolic links place their path in the * di_db area. */ #define di_rdev di_db[0] /* * A UFS1 dinode contains all the meta-data associated with a UFS1 file. * This structure defines the on-disk format of a UFS1 dinode. Since * this structure describes an on-disk structure, all its fields * are defined by types with precise widths. */ struct ufs1_dinode { u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ int16_t di_nlink; /* 2: File link count. */ uint32_t di_freelink; /* 4: SUJ: Next unlinked inode. */ u_int64_t di_size; /* 8: File byte count. */ int32_t di_atime; /* 16: Last access time. */ int32_t di_atimensec; /* 20: Last access time. */ int32_t di_mtime; /* 24: Last modified time. */ int32_t di_mtimensec; /* 28: Last modified time. */ int32_t di_ctime; /* 32: Last inode change time. */ int32_t di_ctimensec; /* 36: Last inode change time. */ - ufs1_daddr_t di_db[NDADDR]; /* 40: Direct disk blocks. */ - ufs1_daddr_t di_ib[NIADDR]; /* 88: Indirect disk blocks. */ + ufs1_daddr_t di_db[UFS_NDADDR]; /* 40: Direct disk blocks. */ + ufs1_daddr_t di_ib[UFS_NIADDR]; /* 88: Indirect disk blocks. */ u_int32_t di_flags; /* 100: Status flags (chflags). */ u_int32_t di_blocks; /* 104: Blocks actually held. */ u_int32_t di_gen; /* 108: Generation number. */ u_int32_t di_uid; /* 112: File owner. */ u_int32_t di_gid; /* 116: File group. */ u_int64_t di_modrev; /* 120: i_modrev for NFSv4 */ }; #endif /* _UFS_UFS_DINODE_H_ */ Index: head/sys/ufs/ufs/ufs_bmap.c =================================================================== --- head/sys/ufs/ufs/ufs_bmap.c (revision 313779) +++ head/sys/ufs/ufs/ufs_bmap.c (revision 313780) @@ -1,384 +1,385 @@ /*- * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Bmap converts the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { ufs2_daddr_t blkno; int error; /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_bop != NULL) *ap->a_bop = &VFSTOUFS(ap->a_vp->v_mount)->um_devvp->v_bufobj; if (ap->a_bnp == NULL) return (0); error = ufs_bmaparray(ap->a_vp, ap->a_bn, &blkno, NULL, ap->a_runp, ap->a_runb); *ap->a_bnp = blkno; return (error); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). 
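 *
 * An illustrative aside (not part of the original diff): with
 * UFS_NDADDR == 12 and NINDIR(fs) == n, the scheme above places the
 * single indirect root at lbn -12, the double indirect root at
 * -(12 + n) - 1, and the triple indirect root at -(12 + n + n*n) - 2.
 * A standalone sketch that prints these roots for an assumed UFS2
 * filesystem with 8 KB blocks (n = 8192 / 8 = 1024 block pointers):
 */

#include <stdint.h>
#include <stdio.h>

#define	TOY_NDADDR	12		/* direct blocks, as above */

int
main(void)
{
	int64_t n = 1024;		/* NINDIR for 8K blocks, UFS2 */
	int64_t single, dbl, triple;

	single = -TOY_NDADDR;
	dbl = -(TOY_NDADDR + n) - 1;
	triple = -(TOY_NDADDR + n + n * n) - 2;
	printf("single indirect root lbn: %jd\n", (intmax_t)single);
	printf("double indirect root lbn: %jd\n", (intmax_t)dbl);
	printf("triple indirect root lbn: %jd\n", (intmax_t)triple);
	return (0);
}

/*
 * End of aside; ufs_bmaparray() follows.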
*/ int ufs_bmaparray(vp, bn, bnp, nbp, runp, runb) struct vnode *vp; ufs2_daddr_t bn; ufs2_daddr_t *bnp; struct buf *nbp; int *runp; int *runb; { struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; - struct indir a[NIADDR+1], *ap; + struct indir a[UFS_NIADDR+1], *ap; ufs2_daddr_t daddr; ufs_lbn_t metalbn; int error, num, maxrun = 0; int *nump; ap = NULL; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); if (runp) { maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; *runp = 0; } if (runb) { *runb = 0; } ap = a; nump = &num; error = ufs_getlbns(vp, bn, ap, nump); if (error) return (error); num = *nump; if (num == 0) { - if (bn >= 0 && bn < NDADDR) { + if (bn >= 0 && bn < UFS_NDADDR) { *bnp = blkptrtodb(ump, DIP(ip, i_db[bn])); - } else if (bn < 0 && bn >= -NXADDR) { + } else if (bn < 0 && bn >= -UFS_NXADDR) { *bnp = blkptrtodb(ump, ip->i_din2->di_extb[-1 - bn]); if (*bnp == 0) *bnp = -1; if (nbp == NULL) panic("ufs_bmaparray: mapping ext data"); nbp->b_xflags |= BX_ALTDATA; return (0); } else { panic("ufs_bmaparray: blkno out of range"); } /* * Since this is FFS independent code, we are out of * scope for the definitions of BLK_NOCOPY and * BLK_SNAP, but we do know that they will fall in * the range 1..um_seqinc, so we use that test and * return a request for a zeroed out buffer if attempts * are made to read a BLK_NOCOPY or BLK_SNAP block. */ if ((ip->i_flags & SF_SNAPSHOT) && DIP(ip, i_db[bn]) > 0 && DIP(ip, i_db[bn]) < ump->um_seqinc) { *bnp = -1; } else if (*bnp == 0) { if (ip->i_flags & SF_SNAPSHOT) *bnp = blkptrtodb(ump, bn * ump->um_seqinc); else *bnp = -1; } else if (runp) { ufs2_daddr_t bnb = bn; - for (++bn; bn < NDADDR && *runp < maxrun && + for (++bn; bn < UFS_NDADDR && *runp < maxrun && is_sequential(ump, DIP(ip, i_db[bn - 1]), DIP(ip, i_db[bn])); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, DIP(ip, i_db[bn]), DIP(ip, i_db[bn+1])); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = DIP(ip, i_ib[ap->in_off]); for (bp = NULL, ++ap; --num; ++ap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = ap->in_lbn; if ((daddr == 0 && !incore(&vp->v_bufobj, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it.
*/ if (bp) bqrelse(bp); bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef INVARIANTS if (!daddr) panic("ufs_bmaparray: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { brelse(bp); return (error); } } if (I_IS_UFS1(ip)) { daddr = ((ufs1_daddr_t *)bp->b_data)[ap->in_off]; if (num == 1 && daddr && runp) { for (bn = ap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((ufs1_daddr_t *)bp->b_data)[bn - 1], ((ufs1_daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = ap->in_off; if (runb && bn) { for (--bn; bn >= 0 && *runb < maxrun && is_sequential(ump, ((ufs1_daddr_t *)bp->b_data)[bn], ((ufs1_daddr_t *)bp->b_data)[bn+1]); --bn, ++*runb); } } continue; } daddr = ((ufs2_daddr_t *)bp->b_data)[ap->in_off]; if (num == 1 && daddr && runp) { for (bn = ap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((ufs2_daddr_t *)bp->b_data)[bn - 1], ((ufs2_daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = ap->in_off; if (runb && bn) { for (--bn; bn >= 0 && *runb < maxrun && is_sequential(ump, ((ufs2_daddr_t *)bp->b_data)[bn], ((ufs2_daddr_t *)bp->b_data)[bn + 1]); --bn, ++*runb); } } } if (bp) bqrelse(bp); /* * Since this is FFS independent code, we are out of scope for the * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they * will fall in the range 1..um_seqinc, so we use that test and * return a request for a zeroed out buffer if attempts are made * to read a BLK_NOCOPY or BLK_SNAP block. */ if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){ *bnp = -1; return (0); } *bnp = blkptrtodb(ump, daddr); if (*bnp == 0) { if (ip->i_flags & SF_SNAPSHOT) *bnp = blkptrtodb(ump, bn * ump->um_seqinc); else *bnp = -1; } return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. */ int ufs_getlbns(vp, bn, ap, nump) struct vnode *vp; ufs2_daddr_t bn; struct indir *ap; int *nump; { ufs2_daddr_t blockcnt; ufs_lbn_t metalbn, realbn; struct ufsmount *ump; int i, numlevels, off; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if (bn < 0) bn = -bn; - /* The first NDADDR blocks are direct blocks. */ - if (bn < NDADDR) + /* The first UFS_NDADDR blocks are direct blocks. */ + if (bn < UFS_NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible - * at the previous level of indirection, and NIADDR - i is the number - * of levels of indirection needed to locate the requested block. + * at the previous level of indirection, and UFS_NIADDR - i is the + * number of levels of indirection needed to locate the requested block. 
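 *
 * A hedged worked example (annotation, not part of the original diff):
 * assume MNINDIR(ump) == 1024 and a request for bn 500000. After
 * bn -= UFS_NDADDR, bn == 499988. The first pass (i == 3) computes
 * blockcnt == 1024, which is too small, so i drops to 2 and bn becomes
 * 498964. The second pass computes blockcnt == 1024 * 1024 == 1048576,
 * which covers bn, so the loop breaks with i == 2. The offset into the
 * inode's indirect array is then UFS_NIADDR - i == 1 (the double
 * indirect block), and the first metalbn computed below is
 * -(500000 - 498964 + 1) == -1037, matching the addressing scheme
 * described before ufs_bmaparray() above.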
*/ - for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + for (blockcnt = 1, i = UFS_NIADDR, bn -= UFS_NDADDR; ; + i--, bn -= blockcnt) { if (i == 0) return (EFBIG); blockcnt *= MNINDIR(ump); if (bn < blockcnt) break; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) - metalbn = -(realbn - bn + NIADDR - i); + metalbn = -(realbn - bn + UFS_NIADDR - i); else - metalbn = -(-realbn - bn + NIADDR - i); + metalbn = -(-realbn - bn + UFS_NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; - ap->in_off = off = NIADDR - i; + ap->in_off = off = UFS_NIADDR - i; ap++; - for (++numlevels; i <= NIADDR; i++) { + for (++numlevels; i <= UFS_NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; blockcnt /= MNINDIR(ump); off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ++ap; metalbn -= -1 + off * blockcnt; } if (nump) *nump = numlevels; return (0); } Index: head/sys/ufs/ufs/ufs_lookup.c =================================================================== --- head/sys/ufs/ufs/ufs_lookup.c (revision 313779) +++ head/sys/ufs/ufs/ufs_lookup.c (revision 313780) @@ -1,1485 +1,1485 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ufs_lookup.c 8.15 (Berkeley) 6/16/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ufs.h" #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #include #include #ifdef DIAGNOSTIC static int dirchk = 1; #else static int dirchk = 0; #endif SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, ""); /* true if old FS format...*/ #define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) static int ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred, struct thread *td) { int error; #ifdef UFS_ACL /* * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt * * 3.16.2.1. ACE4_DELETE vs. ACE4_DELETE_CHILD */ /* * XXX: Is this check required? */ error = VOP_ACCESS(vdp, VEXEC, cred, td); if (error) return (error); error = VOP_ACCESSX(tdp, VDELETE, cred, td); if (error == 0) return (0); error = VOP_ACCESSX(vdp, VDELETE_CHILD, cred, td); if (error == 0) return (0); error = VOP_ACCESSX(vdp, VEXPLICIT_DENY | VDELETE_CHILD, cred, td); if (error) return (error); #endif /* !UFS_ACL */ /* * Standard Unix access control - delete access requires VWRITE. */ error = VOP_ACCESS(vdp, VWRITE, cred, td); if (error) return (error); /* * If directory is "sticky", then user must own * the directory, or the file in it, else she * may not delete it (unless she's root). This * implements append-only directories. */ if ((VTOI(vdp)->i_mode & ISVTX) && VOP_ACCESS(vdp, VADMIN, cred, td) && VOP_ACCESS(tdp, VADMIN, cred, td)) return (EPERM); return (0); } /* * Convert a component of a pathname into a pointer to a locked inode. * This is a very central and rather complicated routine. * If the filesystem is not maintained in a strict tree hierarchy, * this can result in a deadlock situation (see comments in code below). * * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending * on whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it and the target of the pathname * exists, lookup returns both the target and its parent directory locked. * When creating or renaming and LOCKPARENT is specified, the target may * not be ".". When deleting and LOCKPARENT is specified, the target may * be "."., but the caller must check to ensure it does an vrele and vput * instead of two vputs. * * This routine is actually used as VOP_CACHEDLOOKUP method, and the * filesystem employs the generic vfs_cache_lookup() as VOP_LOOKUP * method. * * vfs_cache_lookup() performs the following for us: * check that it is a directory * check accessibility of directory * check for modification attempts on read-only mounts * if name found in cache * if at end of path and deleting or creating * drop it * else * return name. 
* return VOP_CACHEDLOOKUP() * * Overall outline of ufs_lookup: * * search for name in directory, to found or notfound * notfound: * if creating, return locked directory, leaving info on available slots * else return error * found: * if at end of path and deleting, return information to allow delete * if at end of path and rewriting (RENAME and LOCKPARENT), lock target * inode and return info to allow rewrite * if not at end, add name to cache; if at end and neither creating * nor deleting, add name to cache */ int ufs_lookup(ap) struct vop_cachedlookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); } int ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, ino_t *dd_ino) { struct inode *dp; /* inode for directory being searched */ struct buf *bp; /* a buffer of directory entries */ struct direct *ep; /* the current directory entry */ int entryoffsetinblock; /* offset of ep in bp's buffer */ enum {NONE, COMPACT, FOUND} slotstatus; doff_t slotoffset; /* offset of area with free space */ doff_t i_diroff; /* cached i_diroff value. */ doff_t i_offset; /* cached i_offset value. */ int slotsize; /* size of area at slotoffset */ int slotfreespace; /* amount of space free in slot */ int slotneeded; /* size of the entry we're seeking */ int numdirpasses; /* strategy for directory search */ doff_t endsearch; /* offset to end directory search */ doff_t prevoff; /* prev entry dp->i_offset */ struct vnode *pdp; /* saved dp during symlink work */ struct vnode *tdp; /* returned by VFS_VGET */ doff_t enduseful; /* pointer past last used dir slot */ u_long bmask; /* block offset mask */ int namlen, error; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; int nameiop = cnp->cn_nameiop; ino_t ino, ino1; int ltype; if (vpp != NULL) *vpp = NULL; dp = VTOI(vdp); if (dp->i_effnlink == 0) return (ENOENT); /* * Create a vm object if vmiodirenable is enabled. * Alternatively we could call vnode_create_vobject * in VFS_VGET but we could end up creating objects * that are never used. */ vnode_create_vobject(vdp, DIP(dp, i_size), cnp->cn_thread); bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; #ifdef DEBUG_VFS_LOCKS /* * Assert that the directory vnode is locked, and locked * exclusively for the last component lookup for modifying * operations. * * The directory-modifying operations need to save * intermediate state in the inode between namei() call and * actual directory manipulations. See fields in the struct * inode marked as 'used during directory lookup'. We must * ensure that upgrade in namei() does not happen, since * upgrade might need to unlock vdp. If quotas are enabled, * getinoquota() also requires exclusive lock to modify inode. */ ASSERT_VOP_LOCKED(vdp, "ufs_lookup1"); if ((nameiop == CREATE || nameiop == DELETE || nameiop == RENAME) && (flags & (LOCKPARENT | ISLASTCN)) == (LOCKPARENT | ISLASTCN)) ASSERT_VOP_ELOCKED(vdp, "ufs_lookup2"); #endif restart: bp = NULL; slotoffset = -1; /* * We now have a segment name to search for, and a directory to search. * * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. 
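 *
 * A hedged worked example (annotation, not part of the original diff):
 * slotneeded below is DIRECTSIZ(cnp->cn_namelen), the record length a
 * new entry requires. Assuming the classic struct direct layout (an
 * 8-byte header followed by the name and a NUL, padded to a 4-byte
 * boundary), creating "kernel.old" (10 characters) needs
 * 8 + roundup(10 + 1, 4) == 8 + 12 == 20 bytes, and the search below
 * remembers either one slot with that much free room (FOUND) or a run
 * of entries whose combined slack reaches it (COMPACT).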
*/ ino = 0; i_diroff = dp->i_diroff; slotstatus = FOUND; slotfreespace = slotsize = slotneeded = 0; if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) { slotstatus = NONE; slotneeded = DIRECTSIZ(cnp->cn_namelen); } #ifdef UFS_DIRHASH /* * Use dirhash for fast operations on large directories. The logic * to determine whether to hash the directory is contained within * ufsdirhash_build(); a zero return means that it decided to hash * this directory and it successfully built up the hash table. */ if (ufsdirhash_build(dp) == 0) { /* Look for a free slot if needed. */ enduseful = dp->i_size; if (slotstatus != FOUND) { slotoffset = ufsdirhash_findfree(dp, slotneeded, &slotsize); if (slotoffset >= 0) { slotstatus = COMPACT; enduseful = ufsdirhash_enduseful(dp); if (enduseful < 0) enduseful = dp->i_size; } } /* Look up the component. */ numdirpasses = 1; entryoffsetinblock = 0; /* silence compiler warning */ switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, &i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) { case 0: ep = (struct direct *)((char *)bp->b_data + (i_offset & bmask)); goto foundentry; case ENOENT: i_offset = roundup2(dp->i_size, DIRBLKSIZ); goto notfound; default: /* Something failed; just do a linear search. */ break; } } #endif /* UFS_DIRHASH */ /* * If there is cached information on a previous search of * this directory, pick up where we last left off. * We cache only lookups as these are the most common * and have the greatest payoff. Caching CREATE has little * benefit as it usually must search the entire directory * to determine that the entry does not exist. Caching the * location of the last DELETE or RENAME has not reduced * profiling time and hence has been removed in the interest * of simplicity. */ if (nameiop != LOOKUP || i_diroff == 0 || i_diroff >= dp->i_size) { entryoffsetinblock = 0; i_offset = 0; numdirpasses = 1; } else { i_offset = i_diroff; if ((entryoffsetinblock = i_offset & bmask) && (error = UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp))) return (error); numdirpasses = 2; nchstats.ncs_2passes++; } prevoff = i_offset; endsearch = roundup2(dp->i_size, DIRBLKSIZ); enduseful = 0; searchloop: while (i_offset < endsearch) { /* * If necessary, get the next directory block. */ if ((i_offset & bmask) == 0) { if (bp != NULL) brelse(bp); error = UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp); if (error) return (error); entryoffsetinblock = 0; } /* * If still looking for a slot, and at a DIRBLKSIZE * boundary, have to start looking for free space again. */ if (slotstatus == NONE && (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { slotoffset = -1; slotfreespace = 0; } /* * Get pointer to next entry. * Full validation checks are slow, so we only check * enough to insure forward progress through the * directory. Complete checks can be run by patching * "dirchk" to be true. */ ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); if (ep->d_reclen == 0 || ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) { int i; ufs_dirbad(dp, i_offset, "mangled entry"); i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); i_offset += i; entryoffsetinblock += i; continue; } /* * If an appropriate sized slot has not yet been found, * check to see if one is available. Also accumulate space * in the current block so that we can determine if * compaction is viable. 
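 *
 * A hedged worked example (annotation, not part of the original diff):
 * suppose slotneeded == 24 and the scan visits three in-use entries
 * within one DIRBLKSIZ block whose (d_reclen - DIRSIZ) slack is 12, 0
 * and 16 bytes. No single entry has 24 spare bytes, so slotstatus
 * stays NONE at first, but slotfreespace accumulates 12, 12 and then
 * 28; at the third entry 28 >= 24, so slotstatus becomes COMPACT and
 * the region from the first entry through the third can be compacted
 * by ufs_direnter() to make the new entry fit.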
*/ if (slotstatus != FOUND) { int size = ep->d_reclen; if (ep->d_ino != 0) size -= DIRSIZ(OFSFMT(vdp), ep); if (size > 0) { if (size >= slotneeded) { slotstatus = FOUND; slotoffset = i_offset; slotsize = ep->d_reclen; } else if (slotstatus == NONE) { slotfreespace += size; if (slotoffset == -1) slotoffset = i_offset; if (slotfreespace >= slotneeded) { slotstatus = COMPACT; slotsize = i_offset + ep->d_reclen - slotoffset; } } } } /* * Check for a name match. */ if (ep->d_ino) { # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(vdp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if (namlen == cnp->cn_namelen && (cnp->cn_nameptr[0] == ep->d_name[0]) && !bcmp(cnp->cn_nameptr, ep->d_name, (unsigned)namlen)) { #ifdef UFS_DIRHASH foundentry: #endif /* * Save directory entry's inode number and * reclen in ndp->ni_ufs area, and release * directory buffer. */ if (vdp->v_mount->mnt_maxsymlinklen > 0 && ep->d_type == DT_WHT) { slotstatus = FOUND; slotoffset = i_offset; slotsize = ep->d_reclen; enduseful = dp->i_size; cnp->cn_flags |= ISWHITEOUT; numdirpasses--; goto notfound; } ino = ep->d_ino; goto found; } } prevoff = i_offset; i_offset += ep->d_reclen; entryoffsetinblock += ep->d_reclen; if (ep->d_ino) enduseful = i_offset; } notfound: /* * If we started in the middle of the directory and failed * to find our target, we must check the beginning as well. */ if (numdirpasses == 2) { numdirpasses--; i_offset = 0; endsearch = i_diroff; goto searchloop; } if (bp != NULL) brelse(bp); /* * If creating, and at end of pathname and current * directory has not been removed, then can consider * allowing file to be created. */ if ((nameiop == CREATE || nameiop == RENAME || (nameiop == DELETE && (cnp->cn_flags & DOWHITEOUT) && (cnp->cn_flags & ISWHITEOUT))) && (flags & ISLASTCN) && dp->i_effnlink != 0) { /* * Access for write is interpreted as allowing * creation of files in the directory. * * XXX: Fix the comment above. */ if (flags & WILLBEDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); else error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); if (error) return (error); /* * Return an indication of where the new directory * entry should be put. If we didn't find a slot, * then set dp->i_count to 0 indicating * that the new slot belongs at the end of the * directory. If we found a slot, then the new entry * can be put in the range from dp->i_offset to * dp->i_offset + dp->i_count. */ if (slotstatus == NONE) { dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); dp->i_count = 0; enduseful = dp->i_offset; } else if (nameiop == DELETE) { dp->i_offset = slotoffset; if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; } else { dp->i_offset = slotoffset; dp->i_count = slotsize; if (enduseful < slotoffset + slotsize) enduseful = slotoffset + slotsize; } dp->i_endoff = roundup2(enduseful, DIRBLKSIZ); /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). * We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. * The pathname buffer is saved so that the name * can be obtained later. * * NB - if the directory is unlocked, then this * information cannot be used. */ cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } /* * Insert name into cache (as non-existent) if appropriate. 
*/ if ((cnp->cn_flags & MAKEENTRY) != 0) cache_enter(vdp, NULL, cnp); return (ENOENT); found: if (dd_ino != NULL) *dd_ino = ino; if (numdirpasses == 2) nchstats.ncs_pass2++; /* * Check that directory length properly reflects presence * of this entry. */ if (i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) { ufs_dirbad(dp, i_offset, "i_size too small"); dp->i_size = i_offset + DIRSIZ(OFSFMT(vdp), ep); DIP_SET(dp, i_size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; } brelse(bp); /* * Found component in pathname. * If the final component of path name, save information * in the cache as to where the entry was found. */ if ((flags & ISLASTCN) && nameiop == LOOKUP) dp->i_diroff = rounddown2(i_offset, DIRBLKSIZ); /* * If deleting, and at end of pathname, return * parameters which can be used to remove file. */ if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); /* * Return pointer to current entry in dp->i_offset, * and distance past previous entry (if there * is a previous entry in this block) in dp->i_count. * Save directory inode pointer in ndp->ni_dvp for dirremove(). * * Technically we shouldn't be setting these in the * WANTPARENT case (first lookup in rename()), but any * lookups that will result in directory changes will * overwrite these. */ dp->i_offset = i_offset; if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) dp->i_count = 0; else dp->i_count = dp->i_offset - prevoff; if (dd_ino != NULL) return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); if (error) { vput(tdp); return (error); } if (dp->i_number == ino) { VREF(vdp); *vpp = vdp; vput(tdp); return (0); } *vpp = tdp; return (0); } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && (flags & ISLASTCN)) { if (flags & WILLBEDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); else error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); if (error) return (error); /* * Careful about locking second inode. * This can only occur if the target is ".". */ dp->i_offset = i_offset; if (dp->i_number == ino) return (EISDIR); if (dd_ino != NULL) return (0); if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); if (error) { vput(tdp); return (error); } #ifdef SunOS_doesnt_do_that /* * The only purpose of this check is to return the correct * error. Assume that we want to rename directory "a" * to a file "b", and that we have no ACL_WRITE_DATA on * a containing directory, but we _do_ have ACL_APPEND_DATA. * In that case, the VOP_ACCESS check above will return 0, * and the operation will fail with ENOTDIR instead * of EACCESS. */ if (tdp->v_type == VDIR) error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); else error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); if (error) { vput(tdp); return (error); } #endif *vpp = tdp; cnp->cn_flags |= SAVENAME; return (0); } if (dd_ino != NULL) return (0); /* * Step through the translation in the name. We do not `vput' the * directory because we may need it again if a symbolic link * is relative to the current directory. Instead we save it * unlocked as "pdp". 
We must get the target inode before unlocking * the directory to insure that the inode will not be removed * before we get it. We prevent deadlock by always fetching * inodes from the root, moving down the directory tree. Thus * when following backward pointers ".." we must unlock the * parent directory before getting the requested directory. * There is a potential race condition here if both the current * and parent directories are removed before the VFS_VGET for the * inode associated with ".." returns. We hope that this occurs * infrequently since we cannot avoid this race condition without * implementing a sophisticated deadlock detection algorithm. * Note also that this simple deadlock detection scheme will not * work if the filesystem has any hard links other than ".." * that point backwards in the directory structure. */ pdp = vdp; if (flags & ISDOTDOT) { error = vn_vget_ino(pdp, ino, cnp->cn_lkflags, &tdp); if (error) return (error); /* * Recheck that ".." entry in the vdp directory points * to the inode we looked up before vdp lock was * dropped. */ error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); if (error) { vput(tdp); return (error); } if (ino1 != ino) { vput(tdp); goto restart; } *vpp = tdp; } else if (dp->i_number == ino) { VREF(vdp); /* we want ourself, ie "." */ /* * When we lookup "." we still can be asked to lock it * differently. */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(vdp)) { if (ltype == LK_EXCLUSIVE) vn_lock(vdp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(vdp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case may left us with * reclaimed vnode. */ if (vdp->v_iflag & VI_DOOMED) { vrele(vdp); return (ENOENT); } } *vpp = vdp; } else { error = VFS_VGET(pdp->v_mount, ino, cnp->cn_lkflags, &tdp); if (error) return (error); *vpp = tdp; } /* * Insert name into cache if appropriate. */ if (cnp->cn_flags & MAKEENTRY) cache_enter(vdp, *vpp, cnp); return (0); } void ufs_dirbad(ip, offset, how) struct inode *ip; doff_t offset; char *how; { struct mount *mp; mp = ITOV(ip)->v_mount; if ((mp->mnt_flag & MNT_RDONLY) == 0) panic("ufs_dirbad: %s: bad dir ino %ju at offset %ld: %s", mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, (long)offset, how); else (void)printf("%s: bad dir ino %ju at offset %ld: %s\n", mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, (long)offset, how); } /* * Do consistency checking on a directory entry: * record length must be multiple of 4 * entry must fit in rest of its DIRBLKSIZ block * record must be large enough to contain entry * name is not longer than UFS_MAXNAMLEN * name must be as long as advertised, and null terminated */ int ufs_dirbadentry(dp, ep, entryoffsetinblock) struct vnode *dp; struct direct *ep; int entryoffsetinblock; { int i, namlen; # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(dp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if ((ep->d_reclen & 0x3) != 0 || ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || ep->d_reclen < DIRSIZ(OFSFMT(dp), ep) || namlen > UFS_MAXNAMLEN) { /*return (1); */ printf("First bad\n"); goto bad; } if (ep->d_ino == 0) return (0); for (i = 0; i < namlen; i++) if (ep->d_name[i] == '\0') { /*return (1); */ printf("Second bad\n"); goto bad; } if (ep->d_name[i]) goto bad; return (0); bad: return (1); } /* * Construct a new directory entry after a call to namei, using the * parameters that it left in the componentname argument cnp. 
The * argument ip is the inode to which the new directory entry will refer. */ void ufs_makedirentry(ip, cnp, newdirp) struct inode *ip; struct componentname *cnp; struct direct *newdirp; { #ifdef INVARIANTS if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_makedirentry: missing name"); #endif newdirp->d_ino = ip->i_number; newdirp->d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdirp->d_name, (unsigned)cnp->cn_namelen + 1); if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0) newdirp->d_type = IFTODT(ip->i_mode); else { newdirp->d_type = 0; # if (BYTE_ORDER == LITTLE_ENDIAN) { u_char tmp = newdirp->d_namlen; newdirp->d_namlen = newdirp->d_type; newdirp->d_type = tmp; } # endif } } /* * Write a directory entry after a call to namei, using the parameters * that it left in nameidata. The argument dirp is the new directory * entry contents. Dvp is a pointer to the directory to be written, * which was left locked by namei. Remaining parameters (dp->i_offset, * dp->i_count) indicate how the space for the new entry is to be obtained. * Non-null bp indicates that a directory is being created (for the * soft dependency code). */ int ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename) struct vnode *dvp; struct vnode *tvp; struct direct *dirp; struct componentname *cnp; struct buf *newdirbp; int isrename; { struct ucred *cr; struct thread *td; int newentrysize; struct inode *dp; struct buf *bp; u_int dsize; struct direct *ep, *nep; u_int64_t old_isize; int error, ret, blkoff, loc, spacefree, flags, namlen; char *dirbuf; td = curthread; /* XXX */ cr = td->td_ucred; dp = VTOI(dvp); newentrysize = DIRSIZ(OFSFMT(dvp), dirp); if (dp->i_count == 0) { /* * If dp->i_count is 0, then namei could find no * space in the directory. Here, dp->i_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. */ if (dp->i_offset & (DIRBLKSIZ - 1)) panic("ufs_direnter: newblk"); flags = BA_CLRBUF; if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) flags |= IO_SYNC; #ifdef QUOTA if ((error = getinoquota(dp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } #endif old_isize = dp->i_size; vnode_pager_setsize(dvp, (u_long)dp->i_offset + DIRBLKSIZ); if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ, cr, flags, &bp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); vnode_pager_setsize(dvp, (u_long)old_isize); return (error); } dp->i_size = dp->i_offset + DIRBLKSIZ; DIP_SET(dp, i_size, dp->i_size); dp->i_endoff = dp->i_size; dp->i_flag |= IN_CHANGE | IN_UPDATE; dirp->d_reclen = DIRBLKSIZ; blkoff = dp->i_offset & (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1); bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { ufsdirhash_newblk(dp, dp->i_offset); ufsdirhash_add(dp, dirp, dp->i_offset); ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, dp->i_offset); } #endif if (DOINGSOFTDEP(dvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. 
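 *
 * A hedged worked example (annotation, not part of the original diff):
 * if the buffer returned by UFS_BALLOC() spans an 8192-byte block and
 * DIRBLKSIZ == 512, the loop below advances blkoff through each
 * remaining 512-byte chunk and stamps its leading d_reclen with
 * DIRBLKSIZ, so every chunk parses as a single empty entry and the
 * block remains a valid directory no matter which part of it reaches
 * the disk first.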
*/ blkoff += DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } if (softdep_setup_directory_add(bp, dp, dp->i_offset, dirp->d_ino, newdirbp, 1)) dp->i_flag |= IN_NEEDSYNC; if (newdirbp) bdwrite(newdirbp); bdwrite(bp); if ((dp->i_flag & IN_NEEDSYNC) == 0) return (UFS_UPDATE(dvp, 0)); /* * We have just allocated a directory block in an * indirect block. We must prevent holes in the * directory created if directory entries are * written out of order. To accomplish this we * fsync when we extend a directory into indirects. * During rename it's not safe to drop the tvp lock * so sync must be delayed until it is. * * This synchronous step could be removed if fsck and * the kernel were taught to fill in sparse * directories rather than panic. */ if (isrename) return (0); if (tvp != NULL) VOP_UNLOCK(tvp, 0); (void) VOP_FSYNC(dvp, MNT_WAIT, td); if (tvp != NULL) vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); return (error); } if (DOINGASYNC(dvp)) { bdwrite(bp); return (UFS_UPDATE(dvp, 0)); } error = bwrite(bp); ret = UFS_UPDATE(dvp, 1); if (error == 0) return (ret); return (error); } /* * If dp->i_count is non-zero, then namei found space for the new * entry in the range dp->i_offset to dp->i_offset + dp->i_count * in the directory. To use this space, we may have to compact * the entries located there, by copying them together towards the * beginning of the block, leaving the free space in one usable * chunk at the end. */ /* * Increase size of directory if entry eats into new space. * This should never push the size past a new multiple of * DIRBLKSIZE. * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. */ if (dp->i_offset + dp->i_count > dp->i_size) { dp->i_size = dp->i_offset + dp->i_count; DIP_SET(dp, i_size, dp->i_size); } /* * Get the block containing the space for the new directory entry. */ error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp); if (error) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); return (error); } /* * Find space for the new entry. In the simple case, the entry at * offset base will have the space. If it does not, then namei * arranged that compacting the region dp->i_offset to * dp->i_offset + dp->i_count would yield the space. */ ep = (struct direct *)dirbuf; dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0; spacefree = ep->d_reclen - dsize; for (loc = ep->d_reclen; loc < dp->i_count; ) { nep = (struct direct *)(dirbuf + loc); /* Trim the existing slot (NB: dsize may be zero). */ ep->d_reclen = dsize; ep = (struct direct *)((char *)ep + dsize); /* Read nep->d_reclen now as the bcopy() may clobber it. */ loc += nep->d_reclen; if (nep->d_ino == 0) { /* * A mid-block unused entry. Such entries are * never created by the kernel, but fsck_ffs * can create them (and it doesn't fix them). * * Add up the free space, and initialise the * relocated entry since we don't bcopy it. */ spacefree += nep->d_reclen; ep->d_ino = 0; dsize = 0; continue; } dsize = DIRSIZ(OFSFMT(dvp), nep); spacefree += nep->d_reclen - dsize; #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_move(dp, nep, dp->i_offset + ((char *)nep - dirbuf), dp->i_offset + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) softdep_change_directoryentry_offset(bp, dp, dirbuf, (caddr_t)nep, (caddr_t)ep, dsize); else bcopy((caddr_t)nep, (caddr_t)ep, dsize); } /* * Here, `ep' points to a directory entry containing `dsize' in-use * bytes followed by `spacefree' unused bytes. 
If ep->d_ino == 0, * then the entry is completely unused (dsize == 0). The value * of ep->d_reclen is always indeterminate. * * Update the pointer fields in the previous entry (if any), * copy in the new entry, and write out the block. */ # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(dvp)) namlen = ep->d_type; else namlen = ep->d_namlen; # else namlen = ep->d_namlen; # endif if (ep->d_ino == 0 || - (ep->d_ino == WINO && namlen == dirp->d_namlen && + (ep->d_ino == UFS_WINO && namlen == dirp->d_namlen && bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { if (spacefree + dsize < newentrysize) panic("ufs_direnter: compact1"); dirp->d_reclen = spacefree + dsize; } else { if (spacefree < newentrysize) panic("ufs_direnter: compact2"); dirp->d_reclen = spacefree; ep->d_reclen = dsize; ep = (struct direct *)((char *)ep + dsize); } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL && (ep->d_ino == 0 || dirp->d_reclen == spacefree)) ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf)); #endif bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, dirbuf - (dp->i_offset & (DIRBLKSIZ - 1)), rounddown2(dp->i_offset, DIRBLKSIZ)); #endif if (DOINGSOFTDEP(dvp)) { (void) softdep_setup_directory_add(bp, dp, dp->i_offset + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); if (newdirbp != NULL) bdwrite(newdirbp); bdwrite(bp); } else { if (DOINGASYNC(dvp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } } dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * If all went well, and the directory can be shortened, proceed * with the truncation. Note that we have to unlock the inode for * the entry that we just entered, as the truncation may need to * lock other inodes which can lead to deadlock if we also hold a * lock on the newly entered node. */ if (isrename == 0 && error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) { if (tvp != NULL) VOP_UNLOCK(tvp, 0); error = UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_NORMAL | (DOINGASYNC(dvp) ? 0 : IO_SYNC), cr); if (error != 0) vn_printf(dvp, "ufs_direnter: failed to truncate " "err %d", error); #ifdef UFS_DIRHASH if (error == 0 && dp->i_dirhash != NULL) ufsdirhash_dirtrunc(dp, dp->i_endoff); #endif error = 0; if (tvp != NULL) vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); } return (error); } /* * Remove a directory entry after a call to namei, using * the parameters which it left in nameidata. The entry * dp->i_offset contains the offset into the directory of the * entry to be eliminated. The dp->i_count field contains the * size of the previous record in the directory. If this * is 0, the first entry is being deleted, so we need only * zero the inode number to mark the entry as free. If the * entry is not the first in the directory, we must reclaim * the space of the now empty record by adding the record size * to the size of the previous entry. */ int ufs_dirremove(dvp, ip, flags, isrmdir) struct vnode *dvp; struct inode *ip; int flags; int isrmdir; { struct inode *dp; struct direct *ep, *rep; struct buf *bp; int error; dp = VTOI(dvp); /* * Adjust the link count early so softdep can block if necessary. */ if (ip) { ip->i_effnlink--; if (DOINGSOFTDEP(dvp)) { softdep_setup_unlink(dp, ip); } else { ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; } } if (flags & DOWHITEOUT) { /* - * Whiteout entry: set d_ino to WINO. + * Whiteout entry: set d_ino to UFS_WINO. 
*/ if ((error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) return (error); - ep->d_ino = WINO; + ep->d_ino = UFS_WINO; ep->d_type = DT_WHT; goto out; } if ((error = UFS_BLKATOFF(dvp, (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0) return (error); /* Set 'rep' to the entry being removed. */ if (dp->i_count == 0) rep = ep; else rep = (struct direct *)((char *)ep + ep->d_reclen); #ifdef UFS_DIRHASH /* * Remove the dirhash entry. This is complicated by the fact * that `ep' is the previous entry when dp->i_count != 0. */ if (dp->i_dirhash != NULL) ufsdirhash_remove(dp, rep, dp->i_offset); #endif if (ip && rep->d_ino != ip->i_number) panic("ufs_dirremove: ip %ju does not match dirent ino %ju\n", (uintmax_t)ip->i_number, (uintmax_t)rep->d_ino); if (dp->i_count == 0) { /* * First entry in block: set d_ino to zero. */ ep->d_ino = 0; } else { /* * Collapse new free space into previous entry. */ ep->d_reclen += rep->d_reclen; } #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, (char *)ep - ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)), rounddown2(dp->i_offset, DIRBLKSIZ)); #endif out: error = 0; if (DOINGSOFTDEP(dvp)) { if (ip) softdep_setup_remove(bp, dp, ip, isrmdir); if (softdep_slowdown(dvp)) error = bwrite(bp); else bdwrite(bp); } else { if (flags & DOWHITEOUT) error = bwrite(bp); else if (DOINGASYNC(dvp)) bdwrite(bp); else error = bwrite(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when the last open reference goes away. */ if (ip != NULL && (ip->i_flags & SF_SNAPSHOT) != 0 && ip->i_effnlink == 0) UFS_SNAPGONE(ip); return (error); } /* * Rewrite an existing directory entry to point at the inode * supplied. The parameters describing the directory entry are * set up by a call to namei. */ int ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) struct inode *dp, *oip; ino_t newinum; int newtype; int isrmdir; { struct buf *bp; struct direct *ep; struct vnode *vdp = ITOV(dp); int error; /* * Drop the link before we lock the buf so softdep can block if * necessary. */ oip->i_effnlink--; if (DOINGSOFTDEP(vdp)) { softdep_setup_unlink(dp, oip); } else { oip->i_nlink--; DIP_SET(oip, i_nlink, oip->i_nlink); oip->i_flag |= IN_CHANGE; } error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); if (error) return (error); if (ep->d_namlen == 2 && ep->d_name[1] == '.' && ep->d_name[0] == '.' && ep->d_ino != oip->i_number) { brelse(bp); return (EIDRM); } ep->d_ino = newinum; if (!OFSFMT(vdp)) ep->d_type = newtype; if (DOINGSOFTDEP(vdp)) { softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { if (DOINGASYNC(vdp)) { bdwrite(bp); error = 0; } else { error = bwrite(bp); } } dp->i_flag |= IN_CHANGE | IN_UPDATE; /* * If the last named reference to a snapshot goes away, * drop its snapshot reference so that it will be reclaimed * when the last open reference goes away. */ if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_effnlink == 0) UFS_SNAPGONE(oip); return (error); } /* * Check if a directory is empty or not. * Inode supplied must be locked. * * Using a struct dirtemplate here is not precisely * what we want, but better than using a struct direct. * * NB: does not handle corrupted directories.
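The emptiness scan that follows this comment is short enough to sketch in user space. In the sketch below, pread() stands in for vn_rdwr(), WHT_INO for UFS_WINO, and a small header struct for the MINDIRSIZ-sized read; like the kernel routine, it treats a short read or a zero reclen as "not empty" instead of attempting repair, which is what the NB above warns about.

/*
 * User-space sketch of the emptiness scan: only ".", "..", unused slots
 * and whiteouts may appear in an empty directory.  Illustrative only.
 */
#include <sys/types.h>
#include <stdint.h>
#include <unistd.h>

#define WHT_INO	1		/* stand-in for UFS_WINO */

static int
dir_is_empty(int fd, off_t dirsize, uint32_t self, uint32_t parent)
{
	struct {
		uint32_t ino;
		uint16_t reclen;
		uint8_t  namlen;
		char     name[3];
	} h;

	for (off_t off = 0; off < dirsize; off += h.reclen) {
		if (pread(fd, &h, sizeof(h), off) != (ssize_t)sizeof(h))
			return (0);	/* short read: not provably empty */
		if (h.reclen == 0)
			return (0);	/* corrupt: avoid an infinite loop */
		if (h.ino == 0 || h.ino == WHT_INO)
			continue;	/* unused slot or whiteout */
		if (h.namlen > 2 || h.name[0] != '.')
			return (0);	/* a real name: not empty */
		if (h.namlen == 1 && h.ino == self)
			continue;	/* "." */
		if (h.name[1] == '.' && h.ino == parent)
			continue;	/* ".." */
		return (0);
	}
	return (1);
}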
*/ int ufs_dirempty(ip, parentino, cred) struct inode *ip; ino_t parentino; struct ucred *cred; { doff_t off; struct dirtemplate dbuf; struct direct *dp = (struct direct *)&dbuf; int error, namlen; ssize_t count; #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) for (off = 0; off < ip->i_size; off += dp->d_reclen) { error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, &count, (struct thread *)0); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. */ if (error || count != 0) return (0); /* avoid infinite loops */ if (dp->d_reclen == 0) return (0); /* skip empty entries */ - if (dp->d_ino == 0 || dp->d_ino == WINO) + if (dp->d_ino == 0 || dp->d_ino == UFS_WINO) continue; /* accept only "." and ".." */ # if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(ITOV(ip))) namlen = dp->d_type; else namlen = dp->d_namlen; # else namlen = dp->d_namlen; # endif if (namlen > 2) return (0); if (dp->d_name[0] != '.') return (0); /* * At this point namlen must be 1 or 2. * 1 implies ".", 2 implies ".." if second * char is also "." */ if (namlen == 1 && dp->d_ino == ip->i_number) continue; if (dp->d_name[1] == '.' && dp->d_ino == parentino) continue; return (0); } return (1); } static int ufs_dir_dd_ino(struct vnode *vp, struct ucred *cred, ino_t *dd_ino, struct vnode **dd_vp) { struct dirtemplate dirbuf; struct vnode *ddvp; int error, namlen; ASSERT_VOP_LOCKED(vp, "ufs_dir_dd_ino"); if (vp->v_type != VDIR) return (ENOTDIR); /* * First check to see if we have it in the name cache. */ if ((ddvp = vn_dir_dd_ino(vp)) != NULL) { KASSERT(ddvp->v_mount == vp->v_mount, ("ufs_dir_dd_ino: Unexpected mount point crossing")); *dd_ino = VTOI(ddvp)->i_number; *dd_vp = ddvp; return (0); } /* * Have to read the directory. */ error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, NULL, NULL); if (error != 0) return (error); #if (BYTE_ORDER == LITTLE_ENDIAN) if (OFSFMT(vp)) namlen = dirbuf.dotdot_type; else namlen = dirbuf.dotdot_namlen; #else namlen = dirbuf.dotdot_namlen; #endif if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') return (ENOTDIR); *dd_ino = dirbuf.dotdot_ino; *dd_vp = NULL; return (0); } /* * Check if source directory is in the path of the target directory. */ int ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) { struct mount *mp; struct vnode *tvp, *vp, *vp1; int error; ino_t dd_ino; vp = tvp = ITOV(target); mp = vp->v_mount; *wait_ino = 0; if (target->i_number == source_ino) return (EEXIST); if (target->i_number == parent_ino) return (0); - if (target->i_number == ROOTINO) + if (target->i_number == UFS_ROOTINO) return (0); for (;;) { error = ufs_dir_dd_ino(vp, cred, &dd_ino, &vp1); if (error != 0) break; if (dd_ino == source_ino) { error = EINVAL; break; } - if (dd_ino == ROOTINO) + if (dd_ino == UFS_ROOTINO) break; if (dd_ino == parent_ino) break; if (vp1 == NULL) { error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, &vp1); if (error != 0) { *wait_ino = dd_ino; break; } } KASSERT(dd_ino == VTOI(vp1)->i_number, ("directory %ju reparented\n", (uintmax_t)VTOI(vp1)->i_number)); if (vp != tvp) vput(vp); vp = vp1; } if (error == ENOTDIR) panic("checkpath: .. 
not a directory\n"); if (vp1 != NULL) vput(vp1); if (vp != tvp) vput(vp); return (error); } Index: head/sys/ufs/ufs/ufs_vfsops.c =================================================================== --- head/sys/ufs/ufs/ufs_vfsops.c (revision 313779) +++ head/sys/ufs/ufs/ufs_vfsops.c (revision 313780) @@ -1,244 +1,244 @@ /*- * Copyright (c) 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #include #endif MALLOC_DEFINE(M_UFSMNT, "ufs_mount", "UFS mount structure"); /* * Return the root of a filesystem. 
*/ int ufs_root(mp, flags, vpp) struct mount *mp; int flags; struct vnode **vpp; { struct vnode *nvp; int error; - error = VFS_VGET(mp, (ino_t)ROOTINO, flags, &nvp); + error = VFS_VGET(mp, (ino_t)UFS_ROOTINO, flags, &nvp); if (error) return (error); *vpp = nvp; return (0); } /* * Do operations associated with quotas */ int ufs_quotactl(mp, cmds, id, arg) struct mount *mp; int cmds; uid_t id; void *arg; { #ifndef QUOTA if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON) vfs_unbusy(mp); return (EOPNOTSUPP); #else struct thread *td; int cmd, type, error; td = curthread; cmd = cmds >> SUBCMDSHIFT; type = cmds & SUBCMDMASK; if (id == -1) { switch (type) { case USRQUOTA: id = td->td_ucred->cr_ruid; break; case GRPQUOTA: id = td->td_ucred->cr_rgid; break; default: if (cmd == Q_QUOTAON) vfs_unbusy(mp); return (EINVAL); } } if ((u_int)type >= MAXQUOTAS) { if (cmd == Q_QUOTAON) vfs_unbusy(mp); return (EINVAL); } switch (cmd) { case Q_QUOTAON: error = quotaon(td, mp, type, arg); break; case Q_QUOTAOFF: error = quotaoff(td, mp, type); break; case Q_SETQUOTA32: error = setquota32(td, mp, id, type, arg); break; case Q_SETUSE32: error = setuse32(td, mp, id, type, arg); break; case Q_GETQUOTA32: error = getquota32(td, mp, id, type, arg); break; case Q_SETQUOTA: error = setquota(td, mp, id, type, arg); break; case Q_SETUSE: error = setuse(td, mp, id, type, arg); break; case Q_GETQUOTA: error = getquota(td, mp, id, type, arg); break; case Q_GETQUOTASIZE: error = getquotasize(td, mp, id, type, arg); break; case Q_SYNC: error = qsync(mp); break; default: error = EINVAL; break; } return (error); #endif } /* * Initialize UFS filesystems, done only once. */ int ufs_init(vfsp) struct vfsconf *vfsp; { #ifdef QUOTA dqinit(); #endif #ifdef UFS_DIRHASH ufsdirhash_init(); #endif return (0); } /* * Uninitialise UFS filesystems, done before module unload. */ int ufs_uninit(vfsp) struct vfsconf *vfsp; { #ifdef QUOTA dquninit(); #endif #ifdef UFS_DIRHASH ufsdirhash_uninit(); #endif return (0); } /* * This is the generic part of fhtovp called after the underlying * filesystem has validated the file handle. * * Call the VFS_CHECKEXP beforehand to verify access. */ int ufs_fhtovp(mp, ufhp, flags, vpp) struct mount *mp; struct ufid *ufhp; int flags; struct vnode **vpp; { struct inode *ip; struct vnode *nvp; int error; error = VFS_VGET(mp, ufhp->ufid_ino, flags, &nvp); if (error) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ip->i_effnlink <= 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, DIP(ip, i_size), curthread); return (0); } Index: head/sys/ufs/ufs/ufs_vnops.c =================================================================== --- head/sys/ufs/ufs/ufs_vnops.c (revision 313779) +++ head/sys/ufs/ufs/ufs_vnops.c (revision 313780) @@ -1,2834 +1,2834 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_suiddir.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #ifdef UFS_GJOURNAL #include FEATURE(ufs_gjournal, "Journaling support through GEOM for UFS"); #endif #ifdef QUOTA FEATURE(ufs_quota, "UFS disk quotas support"); FEATURE(ufs_quota64, "64bit UFS disk quotas support"); #endif #ifdef SUIDDIR FEATURE(suiddir, "Give all new files in directory the same ownership as the directory"); #endif #include static vop_accessx_t ufs_accessx; static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ufs_close; static vop_create_t ufs_create; static vop_getattr_t ufs_getattr; static vop_ioctl_t ufs_ioctl; static vop_link_t ufs_link; static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *, const char *); static vop_markatime_t ufs_markatime; static vop_mkdir_t ufs_mkdir; static vop_mknod_t ufs_mknod; static vop_open_t ufs_open; static vop_pathconf_t ufs_pathconf; static vop_print_t ufs_print; static vop_readlink_t ufs_readlink; static vop_remove_t ufs_remove; static vop_rename_t ufs_rename; static vop_rmdir_t ufs_rmdir; static vop_setattr_t ufs_setattr; static vop_strategy_t ufs_strategy; static vop_symlink_t ufs_symlink; static vop_whiteout_t ufs_whiteout; static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; static vop_pathconf_t ufsfifo_pathconf; SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); /* * A virgin directory (no blushing please). */ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static struct odirtemplate omastertemplate = { 0, 12, 1, ".", 0, DIRBLKSIZ - 12, 2, ".." 
}; static void ufs_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if (UFS_RDONLY(ip)) goto out; if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) ip->i_flag |= IN_LAZYMOD; else if (((vp->v_mount->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) || (ip->i_flag & (IN_CHANGE | IN_UPDATE))) ip->i_flag |= IN_MODIFIED; else if (ip->i_flag & IN_ACCESS) ip->i_flag |= IN_LAZYACCESS; vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { DIP_SET(ip, i_atime, ts.tv_sec); DIP_SET(ip, i_atimensec, ts.tv_nsec); } if (ip->i_flag & IN_UPDATE) { DIP_SET(ip, i_mtime, ts.tv_sec); DIP_SET(ip, i_mtimensec, ts.tv_nsec); } if (ip->i_flag & IN_CHANGE) { DIP_SET(ip, i_ctime, ts.tv_sec); DIP_SET(ip, i_ctimensec, ts.tv_nsec); DIP_SET(ip, i_modrev, DIP(ip, i_modrev) + 1); } out: ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ufs_itimes(struct vnode *vp) { VI_LOCK(vp); ufs_itimes_locked(vp); VI_UNLOCK(vp); } /* * Create a regular file */ static int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp, "ufs_create"); if (error != 0) return (error); if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ static int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp, "ufs_mknod"); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ DIP_SET(ip, i_rdev, vap->va_rdev); } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. XXX I don't believe this is necessary now. */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } /* * Open called. */ /* ARGSUSED */ static int ufs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); ip = VTOI(vp); /* * Files marked append-only must be opened for appending. */ if ((ip->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td); return (0); } /* * Close called. * * Update the times on the inode. 
*/ /* ARGSUSED */ static int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; int usecount; VI_LOCK(vp); usecount = vp->v_usecount; if (usecount > 1) ufs_itimes_locked(vp); VI_UNLOCK(vp); return (0); } static int ufs_accessx(ap) struct vop_accessx_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; #ifdef QUOTA int relocked; #endif #ifdef UFS_ACL struct acl *acl; acl_type_t type; #endif /* * Disallow write attempts on read-only filesystems; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (accmode & VMODIFY_PERMS) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA /* * Inode is accounted in the quotas only if struct * dquot is attached to it. VOP_ACCESS() is called * from vn_open_cred() and provides a convenient * point to call getinoquota(). */ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { /* * Upgrade vnode lock, since getinoquota() * requires exclusive lock to modify inode. */ relocked = 1; vhold(vp); vn_lock(vp, LK_UPGRADE | LK_RETRY); VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { vdropl(vp); error = ENOENT; goto relock; } vdropl(vp); } else relocked = 0; error = getinoquota(ip); relock: if (relocked) vn_lock(vp, LK_DOWNGRADE | LK_RETRY); if (error != 0) return (error); #endif break; default: break; } } /* * If immutable bit set, nobody gets to write it. "& ~VADMIN_PERMS" * permits the owner of the file to remove the IMMUTABLE flag. */ if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) return (EPERM); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) != 0) { if (vp->v_mount->mnt_flag & MNT_NFS4ACLS) type = ACL_TYPE_NFS4; else type = ACL_TYPE_ACCESS; acl = acl_alloc(M_WAITOK); if (type == ACL_TYPE_NFS4) error = ufs_getacl_nfs4_internal(vp, acl, ap->a_td); else error = VOP_GETACL(vp, type, acl, ap->a_cred, ap->a_td); switch (error) { case 0: if (type == ACL_TYPE_NFS4) { error = vaccess_acl_nfs4(vp->v_type, ip->i_uid, ip->i_gid, acl, accmode, ap->a_cred, NULL); } else { error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, ip->i_gid, acl, accmode, ap->a_cred, NULL); } break; default: if (error != EOPNOTSUPP) printf( "ufs_accessx(): Error retrieving ACL on object (%d).\n", error); /* * XXX: Fall back until debugged. Should * eventually possibly log an error, and return * EPERM for safety. 
*/ error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, accmode, ap->a_cred, NULL); } acl_free(acl); return (error); } #endif /* !UFS_ACL */ error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, accmode, ap->a_cred, NULL); return (error); } /* ARGSUSED */ static int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; VI_LOCK(vp); ufs_itimes_locked(vp); if (I_IS_UFS1(ip)) { vap->va_atime.tv_sec = ip->i_din1->di_atime; vap->va_atime.tv_nsec = ip->i_din1->di_atimensec; } else { vap->va_atime.tv_sec = ip->i_din2->di_atime; vap->va_atime.tv_nsec = ip->i_din2->di_atimensec; } VI_UNLOCK(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_effnlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; if (I_IS_UFS1(ip)) { vap->va_rdev = ip->i_din1->di_rdev; vap->va_size = ip->i_din1->di_size; vap->va_mtime.tv_sec = ip->i_din1->di_mtime; vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din1->di_ctime; vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec; vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks); vap->va_filerev = ip->i_din1->di_modrev; } else { vap->va_rdev = ip->i_din2->di_rdev; vap->va_size = ip->i_din2->di_size; vap->va_mtime.tv_sec = ip->i_din2->di_mtime; vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din2->di_ctime; vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec; vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime; vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec; vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks); vap->va_filerev = ip->i_din2->di_modrev; } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_type = IFTOVT(ip->i_mode); return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if ((vap->va_flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | SF_SNAPSHOT | UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | UF_SPARSE | UF_SYSTEM)) != 0) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. 
* Privileged jail processes behave like privileged non-jail * processes if the security.jail.chflags_allowed sysctl is * non-zero; otherwise, they behave like unprivileged * processes. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } /* The snapshot flag cannot be toggled. */ if ((vap->va_flags ^ ip->i_flags) & SF_SNAPSHOT) return (EPERM); } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) return (EPERM); } ip->i_flags = vap->va_flags; DIP_SET(ip, i_flags, vap->va_flags); ip->i_flag |= IN_CHANGE; error = UFS_UPDATE(vp, 0); if (ip->i_flags & (IMMUTABLE | APPEND)) return (error); } /* * If immutable or append, no one can change any of its attributes * except the ones already handled (in some cases, file flags * including the immutability flags themselves for the superuser). */ if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * XXX most of the following special cases should be in * callers instead of in N filesystems. The VDIR check * mostly already is. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: /* * Truncation should have an effect in these cases. * Disallow it if the filesystem is read-only or * the file is being snapshotted. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support shared memory objects in the file * system, and have dubious support for truncating * symlinks. Just ignore the request in other cases. */ return (0); } if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL | ((vap->va_vaflags & VA_SYNC) != 0 ?
IO_SYNC : 0), cred)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); error = vn_utimes_perm(vp, vap, cred, td); if (error != 0) return (error); ip->i_flag |= IN_CHANGE | IN_MODIFIED; if (vap->va_atime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_ACCESS; DIP_SET(ip, i_atime, vap->va_atime.tv_sec); DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec); } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_UPDATE; DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec); DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec); } if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) { ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec; ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec; } error = UFS_UPDATE(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) return (EPERM); error = ufs_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } #ifdef UFS_ACL static int ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode, int file_owner_id, struct ucred *cred, struct thread *td) { int error; struct acl *aclp; aclp = acl_alloc(M_WAITOK); error = ufs_getacl_nfs4_internal(vp, aclp, td); /* * We don't have to handle EOPNOTSUPP here, as the filesystem claims * it supports ACLs. */ if (error) goto out; acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id); error = ufs_setacl_nfs4_internal(vp, aclp, td); out: acl_free(aclp); return (error); } #endif /* UFS_ACL */ /* * Mark this file's access time for update for vfs_mark_atime(). This * is called from execve() and mmap(). */ static int ufs_markatime(ap) struct vop_markatime_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); /* * XXXKIB No UFS_UPDATE(ap->a_vp, 0) there. */ return (0); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, td) struct vnode *vp; int mode; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. Both of these are allowed in * jail(8). */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0)) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID, 0); if (error) return (error); } /* * Deny setting setuid if we are not the file owner. 
*/ if ((mode & ISUID) && ip->i_uid != cred->cr_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); if (error) return (error); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); DIP_SET(ip, i_mode, ip->i_mode); ip->i_flag |= IN_CHANGE; #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0) error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, td); #endif if (error == 0 && (ip->i_flag & IN_CHANGE) != 0) error = UFS_UPDATE(vp, 0); return (error); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, td) struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA int i; ufs2_daddr_t change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if (((uid != ip->i_uid && uid != cred->cr_uid) || (gid != ip->i_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = DIP(ip, i_blocks); (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; DIP_SET(ip, i_gid, gid); ip->i_uid = uid; DIP_SET(ip, i_uid, uid); #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; DIP_SET(ip, i_gid, ogid); ip->i_uid = ouid; DIP_SET(ip, i_uid, ouid); if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("ufs_chown: lost quota"); #endif /* QUOTA */ ip->i_flag |= IN_CHANGE; if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) { ip->i_mode &= ~(ISUID | ISGID); DIP_SET(ip, i_mode, ip->i_mode); } } error = UFS_UPDATE(vp, 0); return (error); } static int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; struct thread *td; td = curthread; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; 
goto out; } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); if (ip->i_nlink <= 0) vp->v_vflag |= VV_NOSYNC; if ((ip->i_flags & SF_SNAPSHOT) != 0) { /* * Avoid deadlock where another thread is trying to * update the inodeblock for dvp and is waiting on * snaplk. Temporarily unlock the vnode lock for the * unlinked file and sync the directory. This should * allow vput() of the directory to not block later on * while holding the snapshot vnode locked, assuming * that the directory hasn't been unlinked too. */ VOP_UNLOCK(vp, 0); (void) VOP_FSYNC(dvp, MNT_WAIT, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } out: return (error); } static void print_bad_link_count(const char *funcname, struct vnode *dvp) { struct inode *dip; dip = VTOI(dvp); uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n", funcname, dip->i_effnlink, (intmax_t)dip->i_number, dvp->v_mount->mnt_stat.f_mntonname); } /* * link vnode call */ static int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; struct direct newdir; int error; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (VTOI(tdvp)->i_effnlink < 2) { print_bad_link_count("ufs_link", tdvp); error = EINVAL; goto out; } ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } /* * The file may have been removed after namei dropped the original * lock. */ if (ip->i_effnlink == 0) { error = ENOENT; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_effnlink++; ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_setup_link(VTOI(tdvp), ip); error = UFS_UPDATE(vp, !DOINGSOFTDEP(vp) && !DOINGASYNC(vp)); if (!error) { ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0); } if (error) { ip->i_effnlink--; ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_revert_link(VTOI(tdvp), ip); } out: return (error); } /* * whiteout vnode call */ static int ufs_whiteout(ap) struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; int error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (dvp->v_mount->mnt_maxsymlinklen > 0) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ #ifdef INVARIANTS if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_whiteout: missing name"); if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif - newdir.d_ino = WINO; + newdir.d_ino = UFS_WINO; newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0); break; case DELETE: /* remove an existing directory whiteout */ #ifdef INVARIANTS if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); } return (error); } static volatile int rename_restarts;
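A note on the whiteout cases handled above: a whiteout is not a separate on-disk object, only a named entry whose inode field holds the reserved marker (and whose d_type is DT_WHT), so lookup treats the name as nonexistent while union mounts read it as "deleted in the upper layer". In ufs_dirremove(), the DOWHITEOUT flag turns removal into whiteout creation: the slot keeps its name and only the inode field is overwritten. A minimal sketch, reusing the illustrative dent layout from the first sketch (WHT_INO again stands in for UFS_WINO; the simplified layout omits the d_type byte that real entries carry):

#include <stdint.h>
#include <string.h>

/* Turn a prepared slot into a whiteout for `name'; sketch only. */
static void
make_whiteout(struct dent *d, const char *name, uint8_t namlen)
{
	d->ino = WHT_INO;	/* reserved marker, never a real inode */
	d->namlen = namlen;
	memcpy(d->name, name, (size_t)namlen + 1);	/* include the NUL */
}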
SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, __DEVOLATILE(int *, &rename_restarts), 0, "Times rename had to restart due to lock contention"); /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensures the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *nvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; struct inode *fip, *tip, *tdp, *fdp; struct direct newdir; off_t endoff; int doingdirectory, newparent; int error = 0; struct mount *mp; ino_t ino; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif endoff = 0; mp = tdvp->v_mount; VOP_UNLOCK(tdvp, 0); if (tvp && tvp != tdvp) VOP_UNLOCK(tvp, 0); /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; mp = NULL; goto releout; } relock: /* * We need to acquire 2 to 4 locks depending on whether tvp is NULL * and fdvp and tdvp are the same directory. Subsequently we need * to double-check all paths and in the directory rename case we * need to verify that we are not creating a directory loop. To * handle this we acquire all but fdvp using non-blocking * acquisitions. If we fail to acquire any lock in the path we will * drop all held locks, acquire the new lock in a blocking fashion, * and then release it and restart the rename. This acquire/release * step ensures that we do not spin on a lock waiting for release. */ error = vn_lock(fdvp, LK_EXCLUSIVE); if (error) goto releout; if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(fdvp, 0); error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto releout; VOP_UNLOCK(tdvp, 0); atomic_add_int(&rename_restarts, 1); goto relock; } /* * Re-resolve fvp to be certain it still exists and fetch the * correct vnode. */ error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); if (error) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); goto releout; } error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (error) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); if (error != EBUSY) goto releout; error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp, 0); vrele(fvp); fvp = nvp; atomic_add_int(&rename_restarts, 1); goto relock; } vrele(fvp); fvp = nvp; /* * Re-resolve tvp and acquire the vnode lock if present.
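The relock protocol described in the comment above (take the first lock, try-lock the rest, and on contention drop everything, block once on the contended lock so we do not spin, release it, and restart) is a general deadlock-avoidance pattern. Below is a user-space sketch with two pthread mutexes, where restarts plays the role of the rename_restarts counter; it illustrates the pattern, not the kernel's vnode locking.

#include <pthread.h>

static int restarts;	/* plain int for brevity; the kernel uses atomic_add_int() */

static void
lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	for (;;) {
		pthread_mutex_lock(a);
		if (pthread_mutex_trylock(b) == 0)
			return;			/* got both locks */
		pthread_mutex_unlock(a);	/* drop everything we hold */
		pthread_mutex_lock(b);		/* block once on the contended lock */
		pthread_mutex_unlock(b);	/* ...then release it and retry */
		restarts++;
	}
}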
*/ error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); if (error != 0 && error != EJUSTRETURN) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(fvp, 0); goto releout; } /* * If tvp disappeared we just carry on. */ if (error == EJUSTRETURN && tvp != NULL) { vrele(tvp); tvp = NULL; } /* * Get the tvp ino if the lookup succeeded. We may have to restart * if the non-blocking acquire fails. */ if (error == 0) { nvp = NULL; error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (tvp) vrele(tvp); tvp = nvp; if (error) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(fvp, 0); if (error != EBUSY) goto releout; error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; vput(nvp); atomic_add_int(&rename_restarts, 1); goto relock; } } fdp = VTOI(fdvp); fip = VTOI(fvp); tdp = VTOI(tdvp); tip = NULL; if (tvp) tip = VTOI(tvp); if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto unlockout; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. However, things could change after * we drop the locks above. */ if (fvp == tvp) { error = 0; goto unlockout; } doingdirectory = 0; newparent = 0; ino = fip->i_number; if (fip->i_nlink >= LINK_MAX) { error = EMLINK; goto unlockout; } if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (fdp->i_flags & APPEND)) { error = EPERM; goto unlockout; } if ((fip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || fdp == fip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } if (fdp->i_number != tdp->i_number) newparent = tdp->i_number; doingdirectory = 1; } if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) || (tvp != NULL && tvp->v_type == VDIR && tvp->v_mountedhere != NULL)) { error = EXDEV; goto unlockout; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". */ if (doingdirectory && newparent) { error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); if (error) goto unlockout; error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, &ino); /* * We encountered a lock that we have to wait for. Unlock * everything else and VGET before restarting. */ if (ino) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(fvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp) VOP_UNLOCK(tvp, 0); error = VFS_VGET(mp, ino, LK_SHARED, &nvp); if (error == 0) vput(nvp); atomic_add_int(&rename_restarts, 1); goto relock; } if (error) goto unlockout; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); } if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 || tdp->i_effnlink == 0) panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ fip->i_effnlink++; fip->i_nlink++; DIP_SET(fip, i_nlink, fip->i_nlink); fip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) softdep_setup_link(tdp, fip); error = UFS_UPDATE(fvp, !DOINGSOFTDEP(fvp) && !DOINGASYNC(fvp)); if (error) goto bad; /* * 2) If target doesn't exist, link the target * to the source and unlink the source. 
* Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (tip == NULL) { if (ITODEV(tdp) != ITODEV(fip)) panic("ufs_rename: EXDEV"); if (doingdirectory && newparent) { /* * Account for ".." in new directory. * When source and destination have the same * parent we don't adjust the link count. The * actual link modification is completed when * .. is rewritten below. */ if ((nlink_t)tdp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } } ufs_makedirentry(fip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1); if (error) goto bad; /* Setup tdvp for directory compaction if needed. */ if (tdp->i_count && tdp->i_endoff && tdp->i_endoff < tdp->i_size) endoff = tdp->i_endoff; } else { if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip)) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (tip->i_number == fip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller * must possess VADMIN for the parent directory, or the * destination of the rename. This implements append-only * directories. */ if ((tdp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((tip->i_mode & IFMT) == IFDIR) { if ((tip->i_effnlink > 2) || !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } if (doingdirectory) { if (!newparent) { tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); } tip->i_effnlink--; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(tip); } error = ufs_dirrewrite(tdp, tip, fip->i_number, IFTODT(fip->i_mode), (doingdirectory && newparent) ? newparent : doingdirectory); if (error) { if (doingdirectory) { if (!newparent) { tdp->i_effnlink++; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); } tip->i_effnlink++; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(tip); } } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * The only stuff left in the directory is "." * and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. The soft * dependency code will arrange to do these operations * after the parent directory entry has been deleted on * disk, so when running with that code we avoid doing * them now. */ if (!newparent) { tdp->i_nlink--; DIP_SET(tdp, i_nlink, tdp->i_nlink); tdp->i_flag |= IN_CHANGE; } tip->i_nlink--; DIP_SET(tip, i_nlink, tip->i_nlink); tip->i_flag |= IN_CHANGE; } } /* * 3) Unlink the source. We have to resolve the path again to * fixup the directory offset and count for ufs_dirremove. */ if (fdvp == tdvp) { error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); if (error) panic("ufs_rename: from entry went away!"); if (ino != fip->i_number) panic("ufs_rename: ino mismatch %ju != %ju\n", (uintmax_t)ino, (uintmax_t)fip->i_number); } /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. 
*/ if (doingdirectory && newparent) { /* * If tip exists we simply use its link, otherwise we must * add a new one. */ if (tip == NULL) { tdp->i_effnlink++; tdp->i_nlink++; DIP_SET(tdp, i_nlink, tdp->i_nlink); tdp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_setup_dotdot_link(tdp, fip); error = UFS_UPDATE(tdvp, !DOINGSOFTDEP(tdvp) && !DOINGASYNC(tdvp)); /* Don't go to bad here as the new link exists. */ if (error) goto unlockout; } else if (DOINGSUJ(tdvp)) /* Journal must account for each new link. */ softdep_setup_dotdot_link(tdp, fip); fip->i_offset = mastertemplate.dot_reclen; ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); cache_purge(fdvp); } error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); /* * The kern_renameat() looks up the fvp using the DELETE flag, which * causes the removal of the name cache entry for fvp. * As the relookup of the fvp is done in two steps: * ufs_lookup_ino() and then VFS_VGET(), another thread might do a * normal lookup of the from name just before the VFS_VGET() call, * causing the cache entry to be re-instantiated. * * The same issue also applies to tvp if it exists as * otherwise we may have a stale name cache entry for the new * name that references the old i-node if it has other links * or open file descriptors. */ cache_purge(fvp); if (tvp) cache_purge(tvp); cache_purge_negative(tdvp); unlockout: vput(fdvp); vput(fvp); if (tvp) vput(tvp); /* * If compaction or fsync was requested do it now that other locks * are no longer needed. */ if (error == 0 && endoff != 0) { error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred); if (error != 0) vn_printf(tdvp, "ufs_rename: failed to truncate " "err %d", error); #ifdef UFS_DIRHASH else if (tdp->i_dirhash != NULL) ufsdirhash_dirtrunc(tdp, endoff); #endif /* * Even if the directory compaction failed, rename was * successful. Do not propagate a UFS_TRUNCATE() error * to the caller. */ error = 0; } if (error == 0 && tdp->i_flag & IN_NEEDSYNC) error = VOP_FSYNC(tdvp, MNT_WAIT, td); vput(tdvp); return (error); bad: fip->i_effnlink--; fip->i_nlink--; DIP_SET(fip, i_nlink, fip->i_nlink); fip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) softdep_revert_link(tdp, fip); goto unlockout; releout: vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp) vrele(tvp); return (error); } #ifdef UFS_ACL static int ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(M_WAITOK); dacl = acl_alloc(M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); *dacl = *acl; ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released?
*/ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); /* panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the "not defined" case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; DIP_SET(ip, i_mode, mode); ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; DIP_SET(ip, i_mode, mode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()\n"); /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); return (error); } static int ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp, mode_t child_mode, struct ucred *cred, struct thread *td) { int error; struct acl *parent_aclp, *child_aclp; parent_aclp = acl_alloc(M_WAITOK); child_aclp = acl_alloc(M_WAITOK | M_ZERO); error = ufs_getacl_nfs4_internal(dvp, parent_aclp, td); if (error) goto out; acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp, child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR); error = ufs_setacl_nfs4_internal(tvp, child_aclp, td); if (error) goto out; out: acl_free(parent_aclp); acl_free(child_aclp); return (error); } #endif /* * Mkdir system call */ static int ufs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct buf *bp; struct dirtemplate dirtemplate, *dtp; struct direct newdir; int error, dmode; long blkoff; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries.
*/ if (dp->i_effnlink < 2) { print_bad_link_count("ufs_mkdir", dvp); error = EINVAL; goto out; } error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); ip->i_gid = dp->i_gid; DIP_SET(ip, i_gid, dp->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; gid_t ucred_group; ucp = cnp->cn_cred; #endif /* * If we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; DIP_SET(ip, i_uid, dp->i_uid); #ifdef QUOTA if (dp->i_uid != cnp->cn_cred->cr_uid) { /* * Make sure the correct user gets charged * for the space. * Make a dummy credential for the victim. * XXX This seems to never be accessed out of * our context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; ucred.cr_groups[0] = dp->i_gid; ucp = &ucred; } #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. */ dp->i_effnlink++; dp->i_nlink++; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) softdep_setup_mkdir(dp, ip); error = UFS_UPDATE(dvp, !DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* !UFS_ACL */ /* * Initialize directory with "." and ".." from static template. 
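The template fill performed below is worth seeing flattened out: the static template supplies everything except the two inode numbers, and the "." record's reclen of 12 plus the ".." record's reclen of DIRBLKSIZ - 12 exactly tile the first directory block. A sketch, with BLKSIZE standing in for DIRBLKSIZ and a layout mirroring struct dirtemplate; in the kernel the rest of the block comes back zeroed from UFS_BALLOC() with BA_CLRBUF.

#include <stdint.h>
#include <string.h>

#define BLKSIZE	512	/* illustrative; DIRBLKSIZ in the kernel */

struct dtempl {
	uint32_t dot_ino;
	uint16_t dot_reclen;
	uint8_t  dot_type;
	uint8_t  dot_namlen;
	char     dot_name[4];
	uint32_t dotdot_ino;
	uint16_t dotdot_reclen;
	uint8_t  dotdot_type;
	uint8_t  dotdot_namlen;
	char     dotdot_name[4];
};

static const struct dtempl master = {
	0, 12,           4 /* DT_DIR */, 1, ".",
	0, BLKSIZE - 12, 4 /* DT_DIR */, 2, ".."
};

/* Fill the first block of a new directory; sketch only. */
static void
first_block(char *blk, uint32_t self, uint32_t parent)
{
	struct dtempl t = master;

	t.dot_ino = self;	/* "." points at the new directory */
	t.dotdot_ino = parent;	/* ".." points at its parent */
	memcpy(blk, &t, sizeof(t));
}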
*/ if (dvp->v_mount->mnt_maxsymlinklen > 0) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; vnode_pager_setsize(tvp, DIRBLKSIZ); if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, BA_CLRBUF, &bp)) != 0) goto bad; ip->i_size = DIRBLKSIZ; DIP_SET(ip, i_size, DIRBLKSIZ); ip->i_flag |= IN_CHANGE | IN_UPDATE; bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); if (DOINGSOFTDEP(tvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff = DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } } if ((error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp))) != 0) { (void)bwrite(bp); goto bad; } /* * Directory set up, now install its entry in the parent directory. * * If we are not doing soft dependencies, then we must write out the * buffer containing the new directory body before entering the new * name in the parent. If we are doing soft dependencies, then the * buffer containing the new directory body will be passed to and * released in the soft dependency code after the code has attached * an appropriate ordering dependency to the buffer which ensures that * the buffer is written before the new name is written in the parent. */ if (DOINGASYNC(dvp)) bdwrite(bp); else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0); bad: if (error == 0) { *ap->a_vpp = tvp; } else { dp->i_effnlink--; dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) softdep_revert_mkdir(dp, ip); vput(tvp); } out: return (error); } /* * Rmdir system call. */ static int ufs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Do not remove a directory that is in the process of being renamed. * Verify the directory is empty (and valid). Rmdir ".." will not be * valid since ".." will contain a reference to the current directory * and thus be non-empty. Do not allow the removal of mounted on * directories (this can happen when an NFS exported filesystem * tries to remove a locally mounted on directory). */ error = 0; if (dp->i_effnlink <= 2) { if (dp->i_effnlink == 2) print_bad_link_count("ufs_rmdir", dvp); error = EINVAL; goto out; } if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } if (vp->v_mountedhere != 0) { error = EINVAL; goto out; } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif /* * Delete reference to directory before purging * inode. 
If we crash in between, the directory * will be reattached to lost+found, */ dp->i_effnlink--; ip->i_effnlink--; if (DOINGSOFTDEP(vp)) softdep_setup_rmdir(dp, ip); error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; if (DOINGSOFTDEP(vp)) softdep_revert_rmdir(dp, ip); goto out; } cache_purge(dvp); /* * The only stuff left in the directory is "." and "..". The "." * reference is inconsequential since we are quashing it. The soft * dependency code will arrange to do these operations after * the parent directory entry has been deleted on disk, so * when running with that code we avoid doing them now. */ if (!DOINGSOFTDEP(vp)) { dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; error = UFS_UPDATE(dvp, 0); ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; } cache_purge(vp); #ifdef UFS_DIRHASH /* Kill any active hash; i_effnlink == 0, so it will not come back. */ if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif out: return (error); } /* * symlink -- make a symbolic link */ static int ufs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp, "ufs_symlink"); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, SHORTLINK(ip), len); ip->i_size = len; DIP_SET(ip, i_size, len); ip->i_flag |= IN_CHANGE | IN_UPDATE; error = UFS_UPDATE(vp, 0); } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, NULL, NULL); if (error) vput(vp); return (error); } /* * Vnode op for reading directories. 
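 *
 * Each on-disk struct direct below is repacked into a struct dirent
 * whose record length is GENERIC_DIRSIZ(): the fixed dirent header plus
 * the name and its terminating NUL, rounded up to 4-byte alignment.  A
 * userspace sketch of that computation (dirent_reclen() is an invented
 * name, and the historical 4-byte rounding is assumed):
 *
 *     #include <stddef.h>
 *     #include <dirent.h>
 *
 *     static size_t
 *     dirent_reclen(size_t namlen)
 *     {
 *             return ((offsetof(struct dirent, d_name) +
 *                 namlen + 1 + 3) & ~(size_t)3);
 *     }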
*/ int ufs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct buf *bp; struct inode *ip; struct direct *dp, *edp; u_long *cookies; struct dirent dstdp; off_t offset, startoffset; size_t readcnt, skipcnt; ssize_t startresid; int ncookies; int error; if (uio->uio_offset < 0) return (EINVAL); ip = VTOI(vp); if (ip->i_effnlink == 0) return (0); if (ap->a_ncookies != NULL) { ncookies = uio->uio_resid; if (uio->uio_offset >= ip->i_size) ncookies = 0; else if (ip->i_size - uio->uio_offset < ncookies) ncookies = ip->i_size - uio->uio_offset; ncookies = ncookies / (offsetof(struct direct, d_name) + 4) + 1; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } else { ncookies = 0; cookies = NULL; } offset = startoffset = uio->uio_offset; startresid = uio->uio_resid; error = 0; while (error == 0 && uio->uio_resid > 0 && uio->uio_offset < ip->i_size) { error = ffs_blkatoff(vp, uio->uio_offset, NULL, &bp); if (error) break; if (bp->b_offset + bp->b_bcount > ip->i_size) readcnt = ip->i_size - bp->b_offset; else readcnt = bp->b_bcount; skipcnt = (size_t)(uio->uio_offset - bp->b_offset) & ~(size_t)(DIRBLKSIZ - 1); offset = bp->b_offset + skipcnt; dp = (struct direct *)&bp->b_data[skipcnt]; edp = (struct direct *)&bp->b_data[readcnt]; while (error == 0 && uio->uio_resid > 0 && dp < edp) { if (dp->d_reclen <= offsetof(struct direct, d_name) || (caddr_t)dp + dp->d_reclen > (caddr_t)edp) { error = EIO; break; } #if BYTE_ORDER == LITTLE_ENDIAN /* Old filesystem format. */ if (vp->v_mount->mnt_maxsymlinklen <= 0) { dstdp.d_namlen = dp->d_type; dstdp.d_type = dp->d_namlen; } else #endif { dstdp.d_namlen = dp->d_namlen; dstdp.d_type = dp->d_type; } if (offsetof(struct direct, d_name) + dstdp.d_namlen > dp->d_reclen) { error = EIO; break; } if (offset < startoffset || dp->d_ino == 0) goto nextentry; dstdp.d_fileno = dp->d_ino; dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); dstdp.d_name[dstdp.d_namlen] = '\0'; if (dstdp.d_reclen > uio->uio_resid) { if (uio->uio_resid == startresid) error = EINVAL; else error = EJUSTRETURN; break; } /* Advance dp. */ error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio); if (error) break; if (cookies != NULL) { KASSERT(ncookies > 0, ("ufs_readdir: cookies buffer too small")); *cookies = offset + dp->d_reclen; cookies++; ncookies--; } nextentry: offset += dp->d_reclen; dp = (struct direct *)((caddr_t)dp + dp->d_reclen); } bqrelse(bp); uio->uio_offset = offset; } /* We need to correct uio_offset. 
*/ uio->uio_offset = offset; if (error == EJUSTRETURN) error = 0; if (ap->a_ncookies != NULL) { if (error == 0) { *ap->a_ncookies -= ncookies; } else { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } } if (error == 0 && ap->a_eofflag) *ap->a_eofflag = ip->i_size <= uio->uio_offset; return (error); } /* * Return target name of a symbolic link */ static int ufs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); doff_t isize; isize = ip->i_size; if ((isize < vp->v_mount->mnt_maxsymlinklen) || DIP(ip, i_blocks) == 0) { /* XXX - for old fastlink support */ return (uiomove(SHORTLINK(ip), isize, ap->a_uio)); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ufs_bmaparray() operation may not * deadlock on memory. See ufs_bmap() for details. */ static int ufs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; ufs2_daddr_t blkno; int error; if (bp->b_blkno == bp->b_lblkno) { error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); printf("\tino %lu, on dev %s", (u_long)ip->i_number, devtoname(ITODEV(ip))); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; int usecount; VI_LOCK(vp); usecount = vp->v_usecount; if (usecount > 1) ufs_itimes_locked(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ufs kqfilter routines if needed. */ static int ufsfifo_kqfilter(ap) struct vop_kqfilter_args *ap; { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = vfs_kqfilter(ap); return (error); } /* * Return POSIX pathconf information applicable to fifos. */ static int ufsfifo_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_ACL_PATH_MAX: case _PC_MAC_PRESENT: return (ufs_pathconf(ap)); default: return (fifo_specops.vop_pathconf(ap)); } /* NOTREACHED */ } /* * Return POSIX pathconf information applicable to ufs filesystems.
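 *
 * These are the values userland retrieves with pathconf(2); for
 * example, against any file on a ufs mount (illustrative program,
 * error handling omitted):
 *
 *     #include <stdio.h>
 *     #include <unistd.h>
 *
 *     int
 *     main(void)
 *     {
 *             printf("NAME_MAX = %ld\n", pathconf("/", _PC_NAME_MAX));
 *             printf("LINK_MAX = %ld\n", pathconf("/", _PC_LINK_MAX));
 *             printf("NO_TRUNC = %ld\n", pathconf("/", _PC_NO_TRUNC));
 *             return (0);
 *     }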
*/ static int ufs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { int error; error = 0; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; break; case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; break; case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; case _PC_ACL_EXTENDED: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; #else *ap->a_retval = 0; #endif break; case _PC_ACL_NFS4: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; #else *ap->a_retval = 0; #endif break; case _PC_ACL_PATH_MAX: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; #else *ap->a_retval = 3; #endif break; case _PC_MAC_PRESENT: #ifdef MAC if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL) *ap->a_retval = 1; else *ap->a_retval = 0; #else *ap->a_retval = 0; #endif break; case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_ASYNC_IO: /* _PC_ASYNC_IO should have been handled by upper layers. */ KASSERT(0, ("_PC_ASYNC_IO should not get here")); error = EINVAL; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = EINVAL; break; } return (error); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ufs_vinit(mntp, fifoops, vpp) struct mount *mntp; struct vop_vector *fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp; vp = *vpp; ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); if (vp->v_type == VFIFO) vp->v_op = fifoops; ASSERT_VOP_LOCKED(vp, "ufs_vinit"); - if (ip->i_number == ROOTINO) + if (ip->i_number == UFS_ROOTINO) vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Allocate a new inode. * Vnode dvp must be locked. */ static int ufs_makeinode(mode, dvp, vpp, cnp, callfunc) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; const char *callfunc; { struct inode *ip, *pdir; struct direct newdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", callfunc); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; if (pdir->i_effnlink < 2) { print_bad_link_count(callfunc, dvp); return (EINVAL); } error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); ip = VTOI(tvp); ip->i_gid = pdir->i_gid; DIP_SET(ip, i_gid, pdir->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; gid_t ucred_group; ucp = cnp->cn_cred; #endif /* * If we are not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. 
* Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; DIP_SET(ip, i_uid, ip->i_uid); mode &= ~07111; #ifdef QUOTA /* * Make sure the correct user gets charged * for the space. * Quickly knock up a dummy credential for the victim. * XXX This seems to never be accessed out of our * context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; ucred.cr_groups[0] = pdir->i_gid; ucp = &ucred; #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; DIP_SET(ip, i_mode, mode); tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 1; ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) { ip->i_mode &= ~ISGID; DIP_SET(ip, i_mode, ip->i_mode); } if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Make sure inode goes to disk before directory entry. */ error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp)); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* !UFS_ACL */ ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) softdep_revert_create(VTOI(dvp), ip); vput(tvp); return (error); } static int ufs_ioctl(struct vop_ioctl_args *ap) { switch (ap->a_command) { case FIOSEEKDATA: case FIOSEEKHOLE: return (vn_bmap_seekhole(ap->a_vp, ap->a_command, (off_t *)ap->a_data, ap->a_cred)); default: return (ENOTTY); } } /* Global vfs data structures for ufs. 
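 *
 * Slots left unset in these tables fall through to the .vop_default
 * vector (and so, for the fifo table, to fifo_specops).  The dispatch
 * idea in miniature, with invented names; the real VOP_*() glue is
 * generated, this is only the shape of it:
 *
 *     struct mini_vector {
 *             struct mini_vector *v_default;
 *             int (*v_read)(void *);
 *     };
 *
 *     static int
 *     MINI_READ(struct mini_vector *v, void *arg)
 *     {
 *             while (v->v_read == NULL)
 *                     v = v->v_default;
 *             return (v->v_read(arg));
 *     }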
*/ struct vop_vector ufs_vnodeops = { .vop_default = &default_vnodeops, .vop_fsync = VOP_PANIC, .vop_read = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_write = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_bmap = ufs_bmap, .vop_cachedlookup = ufs_lookup, .vop_close = ufs_close, .vop_create = ufs_create, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_ioctl = ufs_ioctl, .vop_link = ufs_link, .vop_lookup = vfs_cache_lookup, .vop_markatime = ufs_markatime, .vop_mkdir = ufs_mkdir, .vop_mknod = ufs_mknod, .vop_open = ufs_open, .vop_pathconf = ufs_pathconf, .vop_poll = vop_stdpoll, .vop_print = ufs_print, .vop_readdir = ufs_readdir, .vop_readlink = ufs_readlink, .vop_reclaim = ufs_reclaim, .vop_remove = ufs_remove, .vop_rename = ufs_rename, .vop_rmdir = ufs_rmdir, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_strategy = ufs_strategy, .vop_symlink = ufs_symlink, .vop_whiteout = ufs_whiteout, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; struct vop_vector ufs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_close = ufsfifo_close, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_kqfilter = ufsfifo_kqfilter, .vop_markatime = ufs_markatime, .vop_pathconf = ufsfifo_pathconf, .vop_print = ufs_print, .vop_read = VOP_PANIC, .vop_reclaim = ufs_reclaim, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_write = VOP_PANIC, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; Index: head/usr.sbin/makefs/ffs/ffs_alloc.c =================================================================== --- head/usr.sbin/makefs/ffs/ffs_alloc.c (revision 313779) +++ head/usr.sbin/makefs/ffs/ffs_alloc.c (revision 313780) @@ -1,683 +1,683 @@ /* $NetBSD: ffs_alloc.c,v 1.14 2004/06/20 22:20:18 jmc Exp $ */ /* From: NetBSD: ffs_alloc.c,v 1.50 2001/09/06 02:16:01 lukem Exp */ /* * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "makefs.h" #include #include #include "ffs/ufs_bswap.h" #include "ffs/buf.h" #include "ffs/ufs_inode.h" #include "ffs/ffs_extern.h" static int scanc(u_int, const u_char *, const u_char *, int); static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int); static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t); static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, daddr_t (*)(struct inode *, int, daddr_t, int)); static int32_t ffs_mapsearch(struct fs *, struct cg *, daddr_t, int); /* * Allocate a block in the file system. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate a block in the same cylinder group. * 4) quadratically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following hierarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadratically rehash into other cylinder groups, until an * available block is located. */ int ffs_alloc(struct inode *ip, daddr_t lbn __unused, daddr_t bpref, int size, daddr_t *bnp) { struct fs *fs = ip->i_fs; daddr_t bno; int cg; *bnp = 0; if (size > fs->fs_bsize || fragoff(fs, size) != 0) { errx(1, "ffs_alloc: bad size: bsize %d size %d", fs->fs_bsize, size); } if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg); if (bno > 0) { if (ip->i_fs->fs_magic == FS_UFS1_MAGIC) ip->i_ffs1_blocks += size / DEV_BSIZE; else ip->i_ffs2_blocks += size / DEV_BSIZE; *bnp = bno; return (0); } nospace: return (ENOSPC); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks. Each additional section contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. If no blocks have been allocated in any other section, the * policy is to place the section in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups.
When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. If the allocation is for the first block in an * indirect block, the information on the previous allocation is unavailable; * here a best guess is made based upon the logical block number being * allocated. * * If a section is already partially allocated, the policy is to * contiguously allocate fs_maxcontig blocks. The end of one of these * contiguous blocks and the beginning of the next is physically separated * so that the disk head will be in transit between them for at least * fs_rotdelay milliseconds. This is to allow time for the processor to * schedule another I/O transfer. */ /* XXX ondisk32 */ daddr_t ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int32_t *bap) { struct fs *fs; int cg; int avgbfree, startcg; fs = ip->i_fs; if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { - if (lbn < NDADDR + NINDIR(fs)) { + if (lbn < UFS_NDADDR + NINDIR(fs)) { cg = ino_to_cg(fs, ip->i_number); return (fs->fs_fpg * cg + fs->fs_frag); } /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; else startcg = dtog(fs, ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) return (fs->fs_fpg * cg + fs->fs_frag); for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) return (fs->fs_fpg * cg + fs->fs_frag); return (0); } /* * We just always try to lay things out contiguously. */ return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; } daddr_t ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int64_t *bap) { struct fs *fs; int cg; int avgbfree, startcg; fs = ip->i_fs; if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { - if (lbn < NDADDR + NINDIR(fs)) { + if (lbn < UFS_NDADDR + NINDIR(fs)) { cg = ino_to_cg(fs, ip->i_number); return (fs->fs_fpg * cg + fs->fs_frag); } /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; else startcg = dtog(fs, ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1); startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { return (fs->fs_fpg * cg + fs->fs_frag); } for (cg = 0; cg < startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { return (fs->fs_fpg * cg + fs->fs_frag); } return (0); } /* * We just always try to lay things out contiguously. */ return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag; } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadratically rehash on the cylinder group number. * 3) brute force search for a free block.
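 *
 * Worked example of the rehash order: the loop below advances by
 * i = 1, 2, 4, ..., so the probes land at offsets 2^k - 1 from the
 * preferred group.  With fs_ncg = 16 and preferred group 5, pass 2
 * probes groups 6, 8, 12 and then 4 (after wrapping), and pass 3
 * sweeps linearly starting at group (5 + 2) % 16 = 7.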
* * `size': size for data blocks, mode for inodes */ /*VARARGS5*/ static daddr_t ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, int size, daddr_t (*allocator)(struct inode *, int, daddr_t, int)) { struct fs *fs; daddr_t result; int i, icg = cg; fs = ip->i_fs; /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. */ static daddr_t ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size) { struct cg *cgp; struct buf *bp; daddr_t bno, blkno; int error, frags, allocsiz, i; struct fs *fs = ip->i_fs; const int needswap = UFS_FSNEEDSWAP(fs); struct vnode vp = { ip->i_fd, ip->i_fs, NULL, 0 }; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); error = bread(&vp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NULL, &bp); if (error) { brelse(bp, 0); return (0); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic_swap(cgp, needswap) || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { brelse(bp, 0); return (0); } if (size == fs->fs_bsize) { bno = ffs_alloccgblk(ip, bp, bpref); bdwrite(bp); return (bno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ frags = numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) { brelse(bp, 0); return (0); } bno = ffs_alloccgblk(ip, bp, bpref); bpref = dtogd(fs, bno); for (i = frags; i < fs->fs_frag; i++) setbit(cg_blksfree_swap(cgp, needswap), bpref + i); i = fs->fs_frag - frags; ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; fs->fs_fmod = 1; ufs_add32(cgp->cg_frsum[i], 1, needswap); bdwrite(bp); return (bno); } bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); for (i = 0; i < frags; i++) clrbit(cg_blksfree_swap(cgp, needswap), bno + i); ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap); fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap); if (frags != allocsiz) ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap); blkno = cg * fs->fs_fpg + bno; bdwrite(bp); return blkno; } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. 
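 *
 * (Continuing the fragment case from ffs_alloccg() above, a worked
 * example with fs_frag = 8: a request for 3 fragments that finds no
 * suitable free run takes a whole block and returns the tail to the
 * free map, i.e.
 *
 *     for (i = 3; i < 8; i++)
 *             setbit(blksfree, bno + i);      fragments 3..7 freed
 *     nffree += 8 - 3;                        five new free fragments
 *     frsum[8 - 3]++;                         one free run of length 5
 *
 * so a later small request can be satisfied from that run.)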
*/ static daddr_t ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref) { struct cg *cgp; daddr_t blkno; int32_t bno; struct fs *fs = ip->i_fs; const int needswap = UFS_FSNEEDSWAP(fs); u_int8_t *blksfree_swap; cgp = (struct cg *)bp->b_data; blksfree_swap = cg_blksfree_swap(cgp, needswap); if (bpref == 0 || (uint32_t)dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) { bpref = ufs_rw32(cgp->cg_rotor, needswap); } else { bpref = blknum(fs, bpref); bno = dtogd(fs, bpref); /* * if the requested block is available, use it */ if (ffs_isblock(fs, blksfree_swap, fragstoblks(fs, bno))) goto gotit; } /* * Take the next available one in this cylinder group. */ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); if (bno < 0) return (0); cgp->cg_rotor = ufs_rw32(bno, needswap); gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree_swap, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--; fs->fs_fmod = 1; blkno = ufs_rw32(cgp->cg_cgx, needswap) * fs->fs_fpg + bno; return (blkno); } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. */ void ffs_blkfree(struct inode *ip, daddr_t bno, long size) { struct cg *cgp; struct buf *bp; int32_t fragno, cgbno; int i, error, cg, blk, frags, bbase; struct fs *fs = ip->i_fs; const int needswap = UFS_FSNEEDSWAP(fs); struct vnode vp = { ip->i_fd, ip->i_fs, NULL, 0 }; if (size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { errx(1, "blkfree: bad size: bno %lld bsize %d size %ld", (long long)bno, fs->fs_bsize, size); } cg = dtog(fs, bno); if (bno >= fs->fs_size) { warnx("bad block %lld, ino %ju", (long long)bno, (uintmax_t)ip->i_number); return; } error = bread(&vp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NULL, &bp); if (error) { brelse(bp, 0); return; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic_swap(cgp, needswap)) { brelse(bp, 0); return; } cgbno = dtogd(fs, bno); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, cg_blksfree_swap(cgp, needswap), fragno)) { errx(1, "blkfree: freeing free block %lld", (long long)bno); } ffs_setblock(fs, cg_blksfree_swap(cgp, needswap), fragno); ffs_clusteracct(fs, cgp, fragno, 1); ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, cg_blksfree_swap(cgp, needswap), bbase); ffs_fragacct_swap(fs, blk, cgp->cg_frsum, -1, needswap); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(cg_blksfree_swap(cgp, needswap), cgbno + i)) { errx(1, "blkfree: freeing free frag: block %lld", (long long)(cgbno + i)); } setbit(cg_blksfree_swap(cgp, needswap), cgbno + i); } ufs_add32(cgp->cg_cs.cs_nffree, i, needswap); fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, cg_blksfree_swap(cgp, needswap), bbase); ffs_fragacct_swap(fs, blk, cgp->cg_frsum, 1, needswap); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, cg_blksfree_swap(cgp, needswap), fragno)) { ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap); 
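/*
 * Worked example (fs_frag = 8): freeing the last 3 fragments of a
 * block credited cs_nffree by 3 above; the block test having now
 * found all 8 fragment bits set, the full block's worth is debited
 * from the fragment counters here and credited to cs_nbfree below.
 */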
fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap); fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } } fs->fs_fmod = 1; bdwrite(bp); } static int scanc(u_int size, const u_char *cp, const u_char table[], int mask) { const u_char *end = &cp[size]; while (cp < end && (table[*cp] & mask) == 0) cp++; return (end - cp); } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. */ static int32_t ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz) { int32_t bno; int start, len, loc, i; int blk, field, subfield, pos; int ostart, olen; const int needswap = UFS_FSNEEDSWAP(fs); /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY; len = howmany(fs->fs_fpg, NBBY) - start; ostart = start; olen = len; loc = scanc((u_int)len, (const u_char *)&cg_blksfree_swap(cgp, needswap)[start], (const u_char *)fragtbl[fs->fs_frag], (1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((u_int)len, (const u_char *)&cg_blksfree_swap(cgp, needswap)[0], (const u_char *)fragtbl[fs->fs_frag], (1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { errx(1, "ffs_alloccg: map corrupted: start %d len %d offset %d %ld", ostart, olen, ufs_rw32(cgp->cg_freeoff, needswap), (long)cg_blksfree_swap(cgp, needswap) - (long)cgp); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = ufs_rw32(bno, needswap); /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, cg_blksfree_swap(cgp, needswap), bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } errx(1, "ffs_alloccg: block not in map: bno %lld", (long long)bno); return (-1); } /* * Update the cluster map because of an allocation or free. * * Cnt == 1 means free; cnt == -1 means allocating. */ void ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt) { int32_t *sump; int32_t *lp; u_char *freemapp, *mapp; int i, start, end, forw, back, map, bit; const int needswap = UFS_FSNEEDSWAP(fs); if (fs->fs_contigsumsize <= 0) return; freemapp = cg_clustersfree_swap(cgp, needswap); sump = cg_clustersum_swap(cgp, needswap); /* * Allocate or clear the actual block. */ if (cnt > 0) setbit(freemapp, blkno); else clrbit(freemapp, blkno); /* * Find the size of the cluster going forward. */ start = blkno + 1; end = start + fs->fs_contigsumsize; if ((unsigned)end >= ufs_rw32(cgp->cg_nclusterblks, needswap)) end = ufs_rw32(cgp->cg_nclusterblks, needswap); mapp = &freemapp[start / NBBY]; map = *mapp++; bit = 1 << (start % NBBY); for (i = start; i < end; i++) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } forw = i - start; /* * Find the size of the cluster going backward. 
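 *
 * Stripped of the incremental map/bit caching, the forward scan above
 * is a plain run-length count of set bits; a sketch (run_forward() is
 * an invented name, 8 = NBBY bits per byte):
 *
 *     static int
 *     run_forward(const unsigned char *map, int start, int end)
 *     {
 *             int i;
 *
 *             for (i = start; i < end; i++)
 *                     if ((map[i / 8] & (1 << (i % 8))) == 0)
 *                             break;
 *             return (i - start);
 *     }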
*/ start = blkno - 1; end = start - fs->fs_contigsumsize; if (end < 0) end = -1; mapp = &freemapp[start / NBBY]; map = *mapp--; bit = 1 << (start % NBBY); for (i = start; i > end; i--) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != 0) { bit >>= 1; } else { map = *mapp--; bit = 1 << (NBBY - 1); } } back = start - i; /* * Account for old cluster and the possibly new forward and * back clusters. */ i = back + forw + 1; if (i > fs->fs_contigsumsize) i = fs->fs_contigsumsize; ufs_add32(sump[i], cnt, needswap); if (back > 0) ufs_add32(sump[back], -cnt, needswap); if (forw > 0) ufs_add32(sump[forw], -cnt, needswap); /* * Update cluster summary information. */ lp = &sump[fs->fs_contigsumsize]; for (i = fs->fs_contigsumsize; i > 0; i--) if (ufs_rw32(*lp--, needswap) > 0) break; fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i; } Index: head/usr.sbin/makefs/ffs/ffs_balloc.c =================================================================== --- head/usr.sbin/makefs/ffs/ffs_balloc.c (revision 313779) +++ head/usr.sbin/makefs/ffs/ffs_balloc.c (revision 313780) @@ -1,576 +1,576 @@ /* $NetBSD: ffs_balloc.c,v 1.13 2004/06/20 22:20:18 jmc Exp $ */ /* From NetBSD: ffs_balloc.c,v 1.25 2001/08/08 08:36:36 lukem Exp */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "makefs.h" #include #include #include "ffs/ufs_bswap.h" #include "ffs/buf.h" #include "ffs/ufs_inode.h" #include "ffs/ffs_extern.h" static int ffs_balloc_ufs1(struct inode *, off_t, int, struct buf **); static int ffs_balloc_ufs2(struct inode *, off_t, int, struct buf **); /* * Balloc defines the structure of file system storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. 
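 *
 * For orientation, the logical block number alone decides how deep the
 * walk below must go; a sketch of the classification performed by
 * ufs_getlbns(), with 12 = UFS_NDADDR direct blocks, 3 = UFS_NIADDR
 * levels, and nindir = NINDIR(fs) pointers per indirect block
 * (indir_level() is an invented name; -1 is the over-large EFBIG case):
 *
 *     #include <stdint.h>
 *
 *     static int
 *     indir_level(int64_t lbn, int64_t nindir)
 *     {
 *             int64_t span = nindir;
 *             int level;
 *
 *             if (lbn < 12)
 *                     return (0);
 *             lbn -= 12;
 *             for (level = 1; level <= 3; level++) {
 *                     if (lbn < span)
 *                             return (level);
 *                     lbn -= span;
 *                     span *= nindir;
 *             }
 *             return (-1);
 *     }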
* * Assume: flags == B_SYNC | B_CLRBUF */ int ffs_balloc(struct inode *ip, off_t offset, int bufsize, struct buf **bpp) { if (ip->i_fs->fs_magic == FS_UFS2_MAGIC) return ffs_balloc_ufs2(ip, offset, bufsize, bpp); else return ffs_balloc_ufs1(ip, offset, bufsize, bpp); } static int ffs_balloc_ufs1(struct inode *ip, off_t offset, int bufsize, struct buf **bpp) { daddr_t lbn, lastlbn; int size; int32_t nb; struct buf *bp, *nbp; struct fs *fs = ip->i_fs; - struct indir indirs[NIADDR + 2]; + struct indir indirs[UFS_NIADDR + 2]; daddr_t newb, pref; int32_t *bap; int osize, nsize, num, i, error; - int32_t *allocblk, allociblk[NIADDR + 1]; + int32_t *allocblk, allociblk[UFS_NIADDR + 1]; int32_t *allocib; const int needswap = UFS_FSNEEDSWAP(fs); struct vnode vp = { ip->i_fd, ip->i_fs, NULL, 0 }; lbn = lblkno(fs, offset); size = blkoff(fs, offset) + bufsize; if (bpp != NULL) { *bpp = NULL; } assert(size <= fs->fs_bsize); if (lbn < 0) return (EFBIG); /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. */ lastlbn = lblkno(fs, ip->i_ffs1_size); - if (lastlbn < NDADDR && lastlbn < lbn) { + if (lastlbn < UFS_NDADDR && lastlbn < lbn) { nb = lastlbn; osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { warnx("need to ffs_realloccg; not supported!"); abort(); } } /* - * The first NDADDR blocks are direct blocks + * The first UFS_NDADDR blocks are direct blocks */ - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { nb = ufs_rw32(ip->i_ffs1_db[lbn], needswap); if (nb != 0 && ip->i_ffs1_size >= lblktosize(fs, lbn + 1)) { /* * The block is an already-allocated direct block * and the file already extends past this block, * thus this must be a whole block. * Just read the block (if requested). */ if (bpp != NULL) { error = bread(&vp, lbn, fs->fs_bsize, NULL, bpp); if (error) { brelse(*bpp, 0); return (error); } } return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_ffs1_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { /* * The existing block is already * at least as big as we want. * Just read the block (if requested). */ if (bpp != NULL) { error = bread(&vp, lbn, osize, NULL, bpp); if (error) { brelse(*bpp, 0); return (error); } } return 0; } else { warnx("need to ffs_realloccg; not supported!"); abort(); } } else { /* * the block was not previously allocated, * allocate a new block or fragment. */ if (ip->i_ffs1_size < lblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; error = ffs_alloc(ip, lbn, ffs_blkpref_ufs1(ip, lbn, (int)lbn, &ip->i_ffs1_db[0]), nsize, &newb); if (error) return (error); if (bpp != NULL) { bp = getblk(&vp, lbn, nsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, newb); clrbuf(bp); *bpp = bp; } } ip->i_ffs1_db[lbn] = ufs_rw32((int32_t)newb, needswap); return (0); } /* * Determine the number of levels of indirection. */ pref = 0; if ((error = ufs_getlbns(ip, lbn, indirs, &num)) != 0) return (error); if (num < 1) { warnx("ffs_balloc: ufs_getlbns returned indirect block"); abort(); } /* * Fetch the first indirect block allocating if necessary. 
*/ --num; nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap); allocib = NULL; allocblk = allociblk; if (nb == 0) { pref = ffs_blkpref_ufs1(ip, lbn, 0, (int32_t *)0); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb); if (error) return error; nb = newb; *allocblk++ = nb; bp = getblk(&vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, nb); clrbuf(bp); /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(bp)) != 0) return error; allocib = &ip->i_ffs1_ib[indirs[0].in_off]; *allocib = ufs_rw32((int32_t)nb, needswap); } /* * Fetch through the indirect blocks, allocating as necessary. */ for (i = 1;;) { error = bread(&vp, indirs[i].in_lbn, fs->fs_bsize, NULL, &bp); if (error) { brelse(bp, 0); return error; } bap = (int32_t *)bp->b_data; nb = ufs_rw32(bap[indirs[i].in_off], needswap); if (i == num) break; i++; if (nb != 0) { brelse(bp, 0); continue; } if (pref == 0) pref = ffs_blkpref_ufs1(ip, lbn, 0, (int32_t *)0); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb); if (error) { brelse(bp, 0); return error; } nb = newb; *allocblk++ = nb; nbp = getblk(&vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); clrbuf(nbp); /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(nbp)) != 0) { brelse(bp, 0); return error; } bap[indirs[i - 1].in_off] = ufs_rw32(nb, needswap); bwrite(bp); } /* * Get the data block, allocating if necessary. */ if (nb == 0) { pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb); if (error) { brelse(bp, 0); return error; } nb = newb; *allocblk++ = nb; if (bpp != NULL) { nbp = getblk(&vp, lbn, fs->fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); clrbuf(nbp); *bpp = nbp; } bap[indirs[num].in_off] = ufs_rw32(nb, needswap); /* * If required, write synchronously, otherwise use * delayed write. */ bwrite(bp); return (0); } brelse(bp, 0); if (bpp != NULL) { error = bread(&vp, lbn, (int)fs->fs_bsize, NULL, &nbp); if (error) { brelse(nbp, 0); return error; } *bpp = nbp; } return (0); } static int ffs_balloc_ufs2(struct inode *ip, off_t offset, int bufsize, struct buf **bpp) { daddr_t lbn, lastlbn; int size; struct buf *bp, *nbp; struct fs *fs = ip->i_fs; - struct indir indirs[NIADDR + 2]; + struct indir indirs[UFS_NIADDR + 2]; daddr_t newb, pref, nb; int64_t *bap; int osize, nsize, num, i, error; - int64_t *allocblk, allociblk[NIADDR + 1]; + int64_t *allocblk, allociblk[UFS_NIADDR + 1]; int64_t *allocib; const int needswap = UFS_FSNEEDSWAP(fs); struct vnode vp = { ip->i_fd, ip->i_fs, NULL, 0 }; lbn = lblkno(fs, offset); size = blkoff(fs, offset) + bufsize; if (bpp != NULL) { *bpp = NULL; } assert(size <= fs->fs_bsize); if (lbn < 0) return (EFBIG); /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. 
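 *
 * Worked example of that check, with fsize = 2048 and bsize = 16384
 * (fs_frag = 8): a 5000-byte file ends in a fragroundup(fs, 5000) =
 * 6144-byte, 3-fragment partial block, so a write past block 0 would
 * first have to grow those 3 fragments into a full 8-fragment block;
 * that is the ffs_realloccg() case makefs does not implement and
 * aborts on below.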
*/ lastlbn = lblkno(fs, ip->i_ffs2_size); - if (lastlbn < NDADDR && lastlbn < lbn) { + if (lastlbn < UFS_NDADDR && lastlbn < lbn) { nb = lastlbn; osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { warnx("need to ffs_realloccg; not supported!"); abort(); } } /* - * The first NDADDR blocks are direct blocks + * The first UFS_NDADDR blocks are direct blocks */ - if (lbn < NDADDR) { + if (lbn < UFS_NDADDR) { nb = ufs_rw64(ip->i_ffs2_db[lbn], needswap); if (nb != 0 && ip->i_ffs2_size >= lblktosize(fs, lbn + 1)) { /* * The block is an already-allocated direct block * and the file already extends past this block, * thus this must be a whole block. * Just read the block (if requested). */ if (bpp != NULL) { error = bread(&vp, lbn, fs->fs_bsize, NULL, bpp); if (error) { brelse(*bpp, 0); return (error); } } return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_ffs2_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { /* * The existing block is already * at least as big as we want. * Just read the block (if requested). */ if (bpp != NULL) { error = bread(&vp, lbn, osize, NULL, bpp); if (error) { brelse(*bpp, 0); return (error); } } return 0; } else { warnx("need to ffs_realloccg; not supported!"); abort(); } } else { /* * the block was not previously allocated, * allocate a new block or fragment. */ if (ip->i_ffs2_size < lblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; error = ffs_alloc(ip, lbn, ffs_blkpref_ufs2(ip, lbn, (int)lbn, &ip->i_ffs2_db[0]), nsize, &newb); if (error) return (error); if (bpp != NULL) { bp = getblk(&vp, lbn, nsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, newb); clrbuf(bp); *bpp = bp; } } ip->i_ffs2_db[lbn] = ufs_rw64(newb, needswap); return (0); } /* * Determine the number of levels of indirection. */ pref = 0; if ((error = ufs_getlbns(ip, lbn, indirs, &num)) != 0) return (error); if (num < 1) { warnx("ffs_balloc: ufs_getlbns returned indirect block"); abort(); } /* * Fetch the first indirect block allocating if necessary. */ --num; nb = ufs_rw64(ip->i_ffs2_ib[indirs[0].in_off], needswap); allocib = NULL; allocblk = allociblk; if (nb == 0) { pref = ffs_blkpref_ufs2(ip, lbn, 0, (int64_t *)0); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb); if (error) return error; nb = newb; *allocblk++ = nb; bp = getblk(&vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, nb); clrbuf(bp); /* * Write synchronously so that indirect blocks * never point at garbage. */ if ((error = bwrite(bp)) != 0) return error; allocib = &ip->i_ffs2_ib[indirs[0].in_off]; *allocib = ufs_rw64(nb, needswap); } /* * Fetch through the indirect blocks, allocating as necessary. */ for (i = 1;;) { error = bread(&vp, indirs[i].in_lbn, fs->fs_bsize, NULL, &bp); if (error) { brelse(bp, 0); return error; } bap = (int64_t *)bp->b_data; nb = ufs_rw64(bap[indirs[i].in_off], needswap); if (i == num) break; i++; if (nb != 0) { brelse(bp, 0); continue; } if (pref == 0) pref = ffs_blkpref_ufs2(ip, lbn, 0, (int64_t *)0); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb); if (error) { brelse(bp, 0); return error; } nb = newb; *allocblk++ = nb; nbp = getblk(&vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); clrbuf(nbp); /* * Write synchronously so that indirect blocks * never point at garbage. 
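 *
 * The ordering discipline in miniature: a child block must reach the
 * disk before any on-disk pointer to it, so each step is
 *
 *     bwrite(child);                  child contents on disk first
 *     parent[off] = child_blkno;      only now hooked into the tree
 *     bwrite(parent);
 *
 * and a crash between the two writes leaves at worst an unreferenced
 * block, never an indirect pointer to garbage.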
*/ if ((error = bwrite(nbp)) != 0) { brelse(bp, 0); return error; } bap[indirs[i - 1].in_off] = ufs_rw64(nb, needswap); bwrite(bp); } /* * Get the data block, allocating if necessary. */ if (nb == 0) { pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, &newb); if (error) { brelse(bp, 0); return error; } nb = newb; *allocblk++ = nb; if (bpp != NULL) { nbp = getblk(&vp, lbn, fs->fs_bsize, 0, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); clrbuf(nbp); *bpp = nbp; } bap[indirs[num].in_off] = ufs_rw64(nb, needswap); /* * If required, write synchronously, otherwise use * delayed write. */ bwrite(bp); return (0); } brelse(bp, 0); if (bpp != NULL) { error = bread(&vp, lbn, (int)fs->fs_bsize, NULL, &nbp); if (error) { brelse(nbp, 0); return error; } *bpp = nbp; } return (0); } Index: head/usr.sbin/makefs/ffs/mkfs.c =================================================================== --- head/usr.sbin/makefs/ffs/mkfs.c (revision 313779) +++ head/usr.sbin/makefs/ffs/mkfs.c (revision 313780) @@ -1,838 +1,839 @@ /* $NetBSD: mkfs.c,v 1.22 2011/10/09 22:30:13 christos Exp $ */ /* * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Copyright (c) 1980, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include "makefs.h" #include "ffs.h" #include #include #include "ffs/ufs_bswap.h" #include "ffs/ufs_inode.h" #include "ffs/ffs_extern.h" #include "ffs/newfs_extern.h" #ifndef BBSIZE #define BBSIZE 8192 /* size of boot area, with label */ #endif static void initcg(int, time_t, const fsinfo_t *); static int ilog2(int); static int count_digits(int); /* * make file system for cylinder-group style file systems */ #define UMASK 0755 #define POWEROF2(num) (((num) & ((num) - 1)) == 0) union { struct fs fs; char pad[SBLOCKSIZE]; } fsun; #define sblock fsun.fs struct csum *fscs; union { struct cg cg; char pad[FFS_MAXBSIZE]; } cgun; #define acg cgun.cg char *iobuf; int iobufsize; char writebuf[FFS_MAXBSIZE]; static int Oflag; /* format as an 4.3BSD file system */ static int64_t fssize; /* file system size */ static int sectorsize; /* bytes/sector */ static int fsize; /* fragment size */ static int bsize; /* block size */ static int maxbsize; /* maximum clustering */ static int maxblkspercg; static int minfree; /* free space threshold */ static int opt; /* optimization preference (space or time) */ static int density; /* number of bytes per inode */ static int maxcontig; /* max contiguous blocks to allocate */ static int maxbpg; /* maximum blocks per file in a cyl group */ static int bbsize; /* boot block size */ static int sbsize; /* superblock size */ static int avgfilesize; /* expected average file size */ static int avgfpdir; /* expected number of files per directory */ struct fs * ffs_mkfs(const char *fsys, const fsinfo_t *fsopts, time_t tstamp) { int fragsperinode, optimalfpg, origdensity, minfpg, lastminfpg; int32_t cylno, i, csfrags; long long sizepb; void *space; int size, blks; int nprintcols, printcolwidth; ffs_opt_t *ffs_opts = fsopts->fs_specific; Oflag = ffs_opts->version; fssize = fsopts->size / fsopts->sectorsize; sectorsize = fsopts->sectorsize; fsize = ffs_opts->fsize; bsize = ffs_opts->bsize; maxbsize = ffs_opts->maxbsize; maxblkspercg = ffs_opts->maxblkspercg; minfree = ffs_opts->minfree; opt = ffs_opts->optimization; density = ffs_opts->density; maxcontig = ffs_opts->maxcontig; maxbpg = ffs_opts->maxbpg; avgfilesize = ffs_opts->avgfilesize; avgfpdir = ffs_opts->avgfpdir; bbsize = BBSIZE; sbsize = SBLOCKSIZE; strlcpy(sblock.fs_volname, ffs_opts->label, sizeof(sblock.fs_volname)); if (Oflag == 0) { sblock.fs_old_inodefmt = FS_42INODEFMT; sblock.fs_maxsymlinklen = 0; sblock.fs_old_flags = 0; } else { sblock.fs_old_inodefmt = FS_44INODEFMT; - sblock.fs_maxsymlinklen = (Oflag == 1 ? MAXSYMLINKLEN_UFS1 : - MAXSYMLINKLEN_UFS2); + sblock.fs_maxsymlinklen = (Oflag == 1 ? UFS1_MAXSYMLINKLEN : + UFS2_MAXSYMLINKLEN); sblock.fs_old_flags = FS_FLAGS_UPDATED; sblock.fs_flags = 0; } /* * Validate the given file system size. * Verify that its last block can actually be accessed. * Convert to file system fragment sized units. 
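 *
 * A worked example of the conversion: with sectorsize = 512 and
 * fsize = 4096, fs_fsbtodb = ilog2(4096 / 512) = 3 below, so a
 * 1000000-sector image becomes dbtofsb() = 1000000 >> 3 = 125000
 * fragments; fsbtodb() shifts back the other way whenever a fragment
 * address is handed to ffs_wtfs().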
*/ if (fssize <= 0) { printf("preposterous size %lld\n", (long long)fssize); exit(13); } ffs_wtfs(fssize - 1, sectorsize, (char *)&sblock, fsopts); /* * collect and verify the filesystem density info */ sblock.fs_avgfilesize = avgfilesize; sblock.fs_avgfpdir = avgfpdir; if (sblock.fs_avgfilesize <= 0) printf("illegal expected average file size %d\n", sblock.fs_avgfilesize), exit(14); if (sblock.fs_avgfpdir <= 0) printf("illegal expected number of files per directory %d\n", sblock.fs_avgfpdir), exit(15); /* * collect and verify the block and fragment sizes */ sblock.fs_bsize = bsize; sblock.fs_fsize = fsize; if (!POWEROF2(sblock.fs_bsize)) { printf("block size must be a power of 2, not %d\n", sblock.fs_bsize); exit(16); } if (!POWEROF2(sblock.fs_fsize)) { printf("fragment size must be a power of 2, not %d\n", sblock.fs_fsize); exit(17); } if (sblock.fs_fsize < sectorsize) { printf("fragment size %d is too small, minimum is %d\n", sblock.fs_fsize, sectorsize); exit(18); } if (sblock.fs_bsize < MINBSIZE) { printf("block size %d is too small, minimum is %d\n", sblock.fs_bsize, MINBSIZE); exit(19); } if (sblock.fs_bsize > FFS_MAXBSIZE) { printf("block size %d is too large, maximum is %d\n", sblock.fs_bsize, FFS_MAXBSIZE); exit(19); } if (sblock.fs_bsize < sblock.fs_fsize) { printf("block size (%d) cannot be smaller than fragment size (%d)\n", sblock.fs_bsize, sblock.fs_fsize); exit(20); } if (maxbsize < bsize || !POWEROF2(maxbsize)) { sblock.fs_maxbsize = sblock.fs_bsize; printf("Extent size set to %d\n", sblock.fs_maxbsize); } else if (sblock.fs_maxbsize > FS_MAXCONTIG * sblock.fs_bsize) { sblock.fs_maxbsize = FS_MAXCONTIG * sblock.fs_bsize; printf("Extent size reduced to %d\n", sblock.fs_maxbsize); } else { sblock.fs_maxbsize = maxbsize; } sblock.fs_maxcontig = maxcontig; if (sblock.fs_maxcontig < sblock.fs_maxbsize / sblock.fs_bsize) { sblock.fs_maxcontig = sblock.fs_maxbsize / sblock.fs_bsize; printf("Maxcontig raised to %d\n", sblock.fs_maxbsize); } if (sblock.fs_maxcontig > 1) sblock.fs_contigsumsize = MIN(sblock.fs_maxcontig,FS_MAXCONTIG); sblock.fs_bmask = ~(sblock.fs_bsize - 1); sblock.fs_fmask = ~(sblock.fs_fsize - 1); sblock.fs_qbmask = ~sblock.fs_bmask; sblock.fs_qfmask = ~sblock.fs_fmask; for (sblock.fs_bshift = 0, i = sblock.fs_bsize; i > 1; i >>= 1) sblock.fs_bshift++; for (sblock.fs_fshift = 0, i = sblock.fs_fsize; i > 1; i >>= 1) sblock.fs_fshift++; sblock.fs_frag = numfrags(&sblock, sblock.fs_bsize); for (sblock.fs_fragshift = 0, i = sblock.fs_frag; i > 1; i >>= 1) sblock.fs_fragshift++; if (sblock.fs_frag > MAXFRAG) { printf("fragment size %d is too small, " "minimum with block size %d is %d\n", sblock.fs_fsize, sblock.fs_bsize, sblock.fs_bsize / MAXFRAG); exit(21); } sblock.fs_fsbtodb = ilog2(sblock.fs_fsize / sectorsize); sblock.fs_size = sblock.fs_providersize = fssize = dbtofsb(&sblock, fssize); if (Oflag <= 1) { sblock.fs_magic = FS_UFS1_MAGIC; sblock.fs_sblockloc = SBLOCK_UFS1; sblock.fs_nindir = sblock.fs_bsize / sizeof(ufs1_daddr_t); sblock.fs_inopb = sblock.fs_bsize / sizeof(struct ufs1_dinode); - sblock.fs_maxsymlinklen = ((NDADDR + NIADDR) * + sblock.fs_maxsymlinklen = ((UFS_NDADDR + UFS_NIADDR) * sizeof (ufs1_daddr_t)); sblock.fs_old_inodefmt = FS_44INODEFMT; sblock.fs_old_cgoffset = 0; sblock.fs_old_cgmask = 0xffffffff; sblock.fs_old_size = sblock.fs_size; sblock.fs_old_rotdelay = 0; sblock.fs_old_rps = 60; sblock.fs_old_nspf = sblock.fs_fsize / sectorsize; sblock.fs_old_cpg = 1; sblock.fs_old_interleave = 1; sblock.fs_old_trackskew = 0; sblock.fs_old_cpc = 0; 
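/*
 * A worked instance of the fs_maxfilesize arithmetic computed a few
 * lines below (assumed parameters, for illustration only): for UFS2
 * with 32 KB blocks, NINDIR = 32768 / sizeof(ufs2_daddr_t) = 4096, so
 * the loop over the UFS_NIADDR indirection levels accumulates
 *
 *	direct:		12 * 32 KB	=  384 KB	(UFS_NDADDR)
 *	1st indirect:	4096 * 32 KB	=  128 MB
 *	2nd indirect:	4096^2 * 32 KB	=  512 GB
 *	3rd indirect:	4096^3 * 32 KB	=    2 PB	(UFS_NIADDR = 3)
 *
 * summed, minus one byte, into fs_maxfilesize; these are the same
 * UFS_NDADDR / UFS_NIADDR constants this revision renames.
 */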
sblock.fs_old_postblformat = 1; sblock.fs_old_nrpos = 1; } else { sblock.fs_magic = FS_UFS2_MAGIC; sblock.fs_sblockloc = SBLOCK_UFS2; sblock.fs_nindir = sblock.fs_bsize / sizeof(ufs2_daddr_t); sblock.fs_inopb = sblock.fs_bsize / sizeof(struct ufs2_dinode); - sblock.fs_maxsymlinklen = ((NDADDR + NIADDR) * + sblock.fs_maxsymlinklen = ((UFS_NDADDR + UFS_NIADDR) * sizeof (ufs2_daddr_t)); } sblock.fs_sblkno = roundup(howmany(sblock.fs_sblockloc + SBLOCKSIZE, sblock.fs_fsize), sblock.fs_frag); sblock.fs_cblkno = (daddr_t)(sblock.fs_sblkno + roundup(howmany(SBLOCKSIZE, sblock.fs_fsize), sblock.fs_frag)); sblock.fs_iblkno = sblock.fs_cblkno + sblock.fs_frag; - sblock.fs_maxfilesize = sblock.fs_bsize * NDADDR - 1; - for (sizepb = sblock.fs_bsize, i = 0; i < NIADDR; i++) { + sblock.fs_maxfilesize = sblock.fs_bsize * UFS_NDADDR - 1; + for (sizepb = sblock.fs_bsize, i = 0; i < UFS_NIADDR; i++) { sizepb *= NINDIR(&sblock); sblock.fs_maxfilesize += sizepb; } /* * Calculate the number of blocks to put into each cylinder group. * * This algorithm selects the number of blocks per cylinder * group. The first goal is to have at least enough data blocks * in each cylinder group to meet the density requirement. Once * this goal is achieved we try to expand to have at least * 1 cylinder group. Once this goal is achieved, we pack as * many blocks into each cylinder group map as will fit. * * We start by calculating the smallest number of blocks that we * can put into each cylinder group. If this is too big, we reduce * the density until it fits. */ origdensity = density; for (;;) { fragsperinode = MAX(numfrags(&sblock, density), 1); minfpg = fragsperinode * INOPB(&sblock); if (minfpg > sblock.fs_size) minfpg = sblock.fs_size; sblock.fs_ipg = INOPB(&sblock); sblock.fs_fpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_fpg < minfpg) sblock.fs_fpg = minfpg; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); sblock.fs_fpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_fpg < minfpg) sblock.fs_fpg = minfpg; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); if (CGSIZE(&sblock) < (unsigned long)sblock.fs_bsize) break; density -= sblock.fs_fsize; } if (density != origdensity) printf("density reduced from %d to %d\n", origdensity, density); if (maxblkspercg <= 0 || maxblkspercg >= fssize) maxblkspercg = fssize - 1; /* * Start packing more blocks into the cylinder group until * it cannot grow any larger, the number of cylinder groups * drops below 1, or we reach the size requested. */ for ( ; sblock.fs_fpg < maxblkspercg; sblock.fs_fpg += sblock.fs_frag) { sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); if (sblock.fs_size / sblock.fs_fpg < 1) break; if (CGSIZE(&sblock) < (unsigned long)sblock.fs_bsize) continue; if (CGSIZE(&sblock) == (unsigned long)sblock.fs_bsize) break; sblock.fs_fpg -= sblock.fs_frag; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); break; } /* * Check to be sure that the last cylinder group has enough blocks * to be viable. If it is too small, reduce the number of blocks * per cylinder group which will have the effect of moving more * blocks into the last cylinder group. 
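 *
 * A small numeric instance (illustrative values only): with fs_frag = 8,
 * fs_size = 1000 fragments and fs_fpg = 400, the last group would get
 * only 1000 % 400 = 200 fragments. If lastminfpg works out to 250, the
 * loop below shaves fs_fpg in steps of fs_frag until, at fs_fpg = 368,
 * the tail group holds 1000 % 368 = 264 >= 250 fragments and the loop
 * stops.
 *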
*/ optimalfpg = sblock.fs_fpg; for (;;) { sblock.fs_ncg = howmany(sblock.fs_size, sblock.fs_fpg); lastminfpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_size < lastminfpg) { printf("Filesystem size %lld < minimum size of %d\n", (long long)sblock.fs_size, lastminfpg); exit(28); } if (sblock.fs_size % sblock.fs_fpg >= lastminfpg || sblock.fs_size % sblock.fs_fpg == 0) break; sblock.fs_fpg -= sblock.fs_frag; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); } if (optimalfpg != sblock.fs_fpg) printf("Reduced frags per cylinder group from %d to %d %s\n", optimalfpg, sblock.fs_fpg, "to enlarge last cyl group"); sblock.fs_cgsize = fragroundup(&sblock, CGSIZE(&sblock)); sblock.fs_dblkno = sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock); if (Oflag <= 1) { sblock.fs_old_spc = sblock.fs_fpg * sblock.fs_old_nspf; sblock.fs_old_nsect = sblock.fs_old_spc; sblock.fs_old_npsect = sblock.fs_old_spc; sblock.fs_old_ncyl = sblock.fs_ncg; } /* * fill in remaining fields of the super block */ sblock.fs_csaddr = cgdmin(&sblock, 0); sblock.fs_cssize = fragroundup(&sblock, sblock.fs_ncg * sizeof(struct csum)); /* * Setup memory for temporary in-core cylgroup summaries. * Cribbed from ffs_mountfs(). */ size = sblock.fs_cssize; blks = howmany(size, sblock.fs_fsize); if (sblock.fs_contigsumsize > 0) size += sblock.fs_ncg * sizeof(int32_t); if ((space = (char *)calloc(1, size)) == NULL) err(1, "memory allocation error for cg summaries"); sblock.fs_csp = space; space = (char *)space + sblock.fs_cssize; if (sblock.fs_contigsumsize > 0) { int32_t *lp; sblock.fs_maxcluster = lp = space; for (i = 0; i < sblock.fs_ncg; i++) *lp++ = sblock.fs_contigsumsize; } sblock.fs_sbsize = fragroundup(&sblock, sizeof(struct fs)); if (sblock.fs_sbsize > SBLOCKSIZE) sblock.fs_sbsize = SBLOCKSIZE; sblock.fs_minfree = minfree; sblock.fs_maxcontig = maxcontig; sblock.fs_maxbpg = maxbpg; sblock.fs_optim = opt; sblock.fs_cgrotor = 0; sblock.fs_pendingblocks = 0; sblock.fs_pendinginodes = 0; sblock.fs_cstotal.cs_ndir = 0; sblock.fs_cstotal.cs_nbfree = 0; sblock.fs_cstotal.cs_nifree = 0; sblock.fs_cstotal.cs_nffree = 0; sblock.fs_fmod = 0; sblock.fs_ronly = 0; sblock.fs_state = 0; sblock.fs_clean = FS_ISCLEAN; sblock.fs_ronly = 0; sblock.fs_id[0] = tstamp; sblock.fs_id[1] = random(); sblock.fs_fsmnt[0] = '\0'; csfrags = howmany(sblock.fs_cssize, sblock.fs_fsize); sblock.fs_dsize = sblock.fs_size - sblock.fs_sblkno - sblock.fs_ncg * (sblock.fs_dblkno - sblock.fs_sblkno); sblock.fs_cstotal.cs_nbfree = fragstoblks(&sblock, sblock.fs_dsize) - howmany(csfrags, sblock.fs_frag); sblock.fs_cstotal.cs_nffree = fragnum(&sblock, sblock.fs_size) + (fragnum(&sblock, csfrags) > 0 ? sblock.fs_frag - fragnum(&sblock, csfrags) : 0); - sblock.fs_cstotal.cs_nifree = sblock.fs_ncg * sblock.fs_ipg - ROOTINO; + sblock.fs_cstotal.cs_nifree = + sblock.fs_ncg * sblock.fs_ipg - UFS_ROOTINO; sblock.fs_cstotal.cs_ndir = 0; sblock.fs_dsize -= csfrags; sblock.fs_time = tstamp; if (Oflag <= 1) { sblock.fs_old_time = tstamp; sblock.fs_old_dsize = sblock.fs_dsize; sblock.fs_old_csaddr = sblock.fs_csaddr; sblock.fs_old_cstotal.cs_ndir = sblock.fs_cstotal.cs_ndir; sblock.fs_old_cstotal.cs_nbfree = sblock.fs_cstotal.cs_nbfree; sblock.fs_old_cstotal.cs_nifree = sblock.fs_cstotal.cs_nifree; sblock.fs_old_cstotal.cs_nffree = sblock.fs_cstotal.cs_nffree; } /* * Dump out summary information about file system. 
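 *
 * A note on the free-inode count just computed: UFS_ROOTINO is 2
 * because inode 0 is never allocated and inode 1 was historically the
 * bad-block inode, so the root directory always lands on inode 2 and
 * the first two inodes are withheld from the free count. As a sketch
 * with assumed local names (the real code fills fs_cstotal.cs_nifree
 * above):
 *
 *	int64_t nifree = (int64_t)ncg * ipg - UFS_ROOTINO;
 *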
*/ #define B2MBFACTOR (1 / (1024.0 * 1024.0)) printf("%s: %.1fMB (%lld sectors) block size %d, " "fragment size %d\n", fsys, (float)sblock.fs_size * sblock.fs_fsize * B2MBFACTOR, (long long)fsbtodb(&sblock, sblock.fs_size), sblock.fs_bsize, sblock.fs_fsize); printf("\tusing %d cylinder groups of %.2fMB, %d blks, " "%d inodes.\n", sblock.fs_ncg, (float)sblock.fs_fpg * sblock.fs_fsize * B2MBFACTOR, sblock.fs_fpg / sblock.fs_frag, sblock.fs_ipg); #undef B2MBFACTOR /* * Now determine how wide each column will be, and calculate how * many columns will fit in a 76 char line. 76 is the width of the * subwindows in sysinst. */ printcolwidth = count_digits( fsbtodb(&sblock, cgsblock(&sblock, sblock.fs_ncg -1))); nprintcols = 76 / (printcolwidth + 2); /* * allocate space for superblock, cylinder group map, and * two sets of inode blocks. */ if (sblock.fs_bsize < SBLOCKSIZE) iobufsize = SBLOCKSIZE + 3 * sblock.fs_bsize; else iobufsize = 4 * sblock.fs_bsize; if ((iobuf = malloc(iobufsize)) == NULL) { printf("Cannot allocate I/O buffer\n"); exit(38); } memset(iobuf, 0, iobufsize); /* * Make a copy of the superblock into the buffer that we will be * writing out in each cylinder group. */ memcpy(writebuf, &sblock, sbsize); if (fsopts->needswap) ffs_sb_swap(&sblock, (struct fs*)writebuf); memcpy(iobuf, writebuf, SBLOCKSIZE); printf("super-block backups (for fsck -b #) at:"); for (cylno = 0; cylno < sblock.fs_ncg; cylno++) { initcg(cylno, tstamp, fsopts); if (cylno % nprintcols == 0) printf("\n"); printf(" %*lld,", printcolwidth, (long long)fsbtodb(&sblock, cgsblock(&sblock, cylno))); fflush(stdout); } printf("\n"); /* * Now construct the initial file system, * then write out the super-block. */ sblock.fs_time = tstamp; if (Oflag <= 1) { sblock.fs_old_cstotal.cs_ndir = sblock.fs_cstotal.cs_ndir; sblock.fs_old_cstotal.cs_nbfree = sblock.fs_cstotal.cs_nbfree; sblock.fs_old_cstotal.cs_nifree = sblock.fs_cstotal.cs_nifree; sblock.fs_old_cstotal.cs_nffree = sblock.fs_cstotal.cs_nffree; } if (fsopts->needswap) sblock.fs_flags |= FS_SWAPPED; ffs_write_superblock(&sblock, fsopts); return (&sblock); } /* * Write out the superblock and its duplicates, * and the cylinder group summaries */ void ffs_write_superblock(struct fs *fs, const fsinfo_t *fsopts) { int cylno, size, blks, i, saveflag; void *space; char *wrbuf; saveflag = fs->fs_flags & FS_INTERNAL; fs->fs_flags &= ~FS_INTERNAL; memcpy(writebuf, &sblock, sbsize); if (fsopts->needswap) ffs_sb_swap(fs, (struct fs*)writebuf); ffs_wtfs(fs->fs_sblockloc / sectorsize, sbsize, writebuf, fsopts); /* Write out the duplicate super blocks */ for (cylno = 0; cylno < fs->fs_ncg; cylno++) ffs_wtfs(fsbtodb(fs, cgsblock(fs, cylno)), sbsize, writebuf, fsopts); /* Write out the cylinder group summaries */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); space = (void *)fs->fs_csp; if ((wrbuf = malloc(size)) == NULL) err(1, "ffs_write_superblock: malloc %d", size); for (i = 0; i < blks; i+= fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if (fsopts->needswap) ffs_csum_swap((struct csum *)space, (struct csum *)wrbuf, size); else memcpy(wrbuf, space, (u_int)size); ffs_wtfs(fsbtodb(fs, fs->fs_csaddr + i), size, wrbuf, fsopts); space = (char *)space + size; } free(wrbuf); fs->fs_flags |= saveflag; } /* * Initialize a cylinder group. 
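 *
 * The "fsck -b #" list printed above enumerates the per-group backup
 * superblocks. A minimal sketch of where those sector numbers come
 * from (simplified to the UFS2 case, where fs_old_cgoffset is zero and
 * cgstart() degenerates to cgbase()):
 *
 *	for (cylno = 0; cylno < fs->fs_ncg; cylno++) {
 *		daddr_t fsb = cgbase(fs, cylno) + fs->fs_sblkno;
 *		printf(" %lld,", (long long)fsbtodb(fs, fsb));
 *	}
 *
 * which matches cgsblock(fs, cylno) = cgstart(fs, cylno) + fs_sblkno.
 *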
*/ static void initcg(int cylno, time_t utime, const fsinfo_t *fsopts) { daddr_t cbase, dmax; int32_t i, j, d, dlower, dupper, blkno; struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; int start; /* * Determine block bounds for cylinder group. * Allow space for super block summary information in first * cylinder group. */ cbase = cgbase(&sblock, cylno); dmax = cbase + sblock.fs_fpg; if (dmax > sblock.fs_size) dmax = sblock.fs_size; dlower = cgsblock(&sblock, cylno) - cbase; dupper = cgdmin(&sblock, cylno) - cbase; if (cylno == 0) dupper += howmany(sblock.fs_cssize, sblock.fs_fsize); memset(&acg, 0, sblock.fs_cgsize); acg.cg_time = utime; acg.cg_magic = CG_MAGIC; acg.cg_cgx = cylno; acg.cg_niblk = sblock.fs_ipg; acg.cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock)); acg.cg_ndblk = dmax - cbase; if (sblock.fs_contigsumsize > 0) acg.cg_nclusterblks = acg.cg_ndblk >> sblock.fs_fragshift; start = &acg.cg_space[0] - (u_char *)(&acg.cg_firstfield); if (Oflag == 2) { acg.cg_iusedoff = start; } else { if (cylno == sblock.fs_ncg - 1) acg.cg_old_ncyl = howmany(acg.cg_ndblk, sblock.fs_fpg / sblock.fs_old_cpg); else acg.cg_old_ncyl = sblock.fs_old_cpg; acg.cg_old_time = acg.cg_time; acg.cg_time = 0; acg.cg_old_niblk = acg.cg_niblk; acg.cg_niblk = 0; acg.cg_initediblk = 0; acg.cg_old_btotoff = start; acg.cg_old_boff = acg.cg_old_btotoff + sblock.fs_old_cpg * sizeof(int32_t); acg.cg_iusedoff = acg.cg_old_boff + sblock.fs_old_cpg * sizeof(u_int16_t); } acg.cg_freeoff = acg.cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT); if (sblock.fs_contigsumsize <= 0) { acg.cg_nextfreeoff = acg.cg_freeoff + howmany(sblock.fs_fpg, CHAR_BIT); } else { acg.cg_clustersumoff = acg.cg_freeoff + howmany(sblock.fs_fpg, CHAR_BIT) - sizeof(int32_t); acg.cg_clustersumoff = roundup(acg.cg_clustersumoff, sizeof(int32_t)); acg.cg_clusteroff = acg.cg_clustersumoff + (sblock.fs_contigsumsize + 1) * sizeof(int32_t); acg.cg_nextfreeoff = acg.cg_clusteroff + howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT); } if (acg.cg_nextfreeoff > sblock.fs_cgsize) { printf("Panic: cylinder group too big\n"); exit(37); } acg.cg_cs.cs_nifree += sblock.fs_ipg; if (cylno == 0) - for (i = 0; i < ROOTINO; i++) { + for (i = 0; i < UFS_ROOTINO; i++) { setbit(cg_inosused_swap(&acg, 0), i); acg.cg_cs.cs_nifree--; } if (cylno > 0) { /* * In cylno 0, beginning space is reserved * for boot and super blocks. 
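 *
 * For reference, the map offsets computed above place the allocation
 * bitmaps back-to-back behind the fixed cg header. A condensed sketch
 * of the UFS2 (Oflag == 2) layout arithmetic, with assumed shorthand
 * names (the cluster summary and cluster map, when enabled, slot in
 * before cg_nextfreeoff):
 *
 *	off = offsetof(struct cg, cg_space);	/* == "start" above */
 *	cg_iusedoff = off;
 *	cg_freeoff = cg_iusedoff + howmany(fs_ipg, CHAR_BIT);
 *	cg_nextfreeoff = cg_freeoff + howmany(fs_fpg, CHAR_BIT);
 *
 * each map is a bitmap, hence the howmany(nbits, CHAR_BIT) byte
 * counts; overflowing fs_cgsize is what trips the "cylinder group too
 * big" check above.
 *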
*/ for (d = 0, blkno = 0; d < dlower;) { ffs_setblock(&sblock, cg_blksfree_swap(&acg, 0), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree_swap(&acg, 0), blkno); acg.cg_cs.cs_nbfree++; d += sblock.fs_frag; blkno++; } } if ((i = (dupper & (sblock.fs_frag - 1))) != 0) { acg.cg_frsum[sblock.fs_frag - i]++; for (d = dupper + sblock.fs_frag - i; dupper < d; dupper++) { setbit(cg_blksfree_swap(&acg, 0), dupper); acg.cg_cs.cs_nffree++; } } for (d = dupper, blkno = dupper >> sblock.fs_fragshift; d + sblock.fs_frag <= acg.cg_ndblk; ) { ffs_setblock(&sblock, cg_blksfree_swap(&acg, 0), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree_swap(&acg, 0), blkno); acg.cg_cs.cs_nbfree++; d += sblock.fs_frag; blkno++; } if (d < acg.cg_ndblk) { acg.cg_frsum[acg.cg_ndblk - d]++; for (; d < acg.cg_ndblk; d++) { setbit(cg_blksfree_swap(&acg, 0), d); acg.cg_cs.cs_nffree++; } } if (sblock.fs_contigsumsize > 0) { int32_t *sump = cg_clustersum_swap(&acg, 0); u_char *mapp = cg_clustersfree_swap(&acg, 0); int map = *mapp++; int bit = 1; int run = 0; for (i = 0; i < acg.cg_nclusterblks; i++) { if ((map & bit) != 0) { run++; } else if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; run = 0; } if ((i & (CHAR_BIT - 1)) != (CHAR_BIT - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; } } sblock.fs_cs(&sblock, cylno) = acg.cg_cs; /* * Write out the duplicate super block, the cylinder group map * and two blocks worth of inodes in a single write. */ start = MAX(sblock.fs_bsize, SBLOCKSIZE); memcpy(&iobuf[start], &acg, sblock.fs_cgsize); if (fsopts->needswap) ffs_cg_swap(&acg, (struct cg*)&iobuf[start], &sblock); start += sblock.fs_bsize; dp1 = (struct ufs1_dinode *)(&iobuf[start]); dp2 = (struct ufs2_dinode *)(&iobuf[start]); for (i = 0; i < acg.cg_initediblk; i++) { if (sblock.fs_magic == FS_UFS1_MAGIC) { /* No need to swap, it'll stay random */ dp1->di_gen = random(); dp1++; } else { dp2->di_gen = random(); dp2++; } } ffs_wtfs(fsbtodb(&sblock, cgsblock(&sblock, cylno)), iobufsize, iobuf, fsopts); /* * For the old file system, we have to initialize all the inodes. 
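 *
 * The cluster accounting built above is a run-length count over the
 * free-block bitmap. The same idea as a self-contained helper
 * (hypothetical code, not part of this change):
 *
 *	static void
 *	cluster_sums(const u_char *map, int nblks, int maxrun,
 *	    int32_t *sump)
 *	{
 *		int run = 0;
 *
 *		for (int i = 0; i <= nblks; i++) {
 *			if (i < nblks &&
 *			    (map[i / CHAR_BIT] & (1 << (i % CHAR_BIT))))
 *				run++;		/* extend current free run */
 *			else if (run != 0) {
 *				sump[MIN(run, maxrun)]++;
 *				run = 0;	/* flush run at a used block */
 *			}
 *		}
 *	}
 *
 * sump[n] ends up counting free runs of length n, capped at maxrun
 * (fs_contigsumsize), which is the summary the block allocator
 * consults when hunting for contiguous extents.
 *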
*/ if (Oflag <= 1) { for (i = 2 * sblock.fs_frag; i < sblock.fs_ipg / INOPF(&sblock); i += sblock.fs_frag) { dp1 = (struct ufs1_dinode *)(&iobuf[start]); for (j = 0; j < INOPB(&sblock); j++) { dp1->di_gen = random(); dp1++; } ffs_wtfs(fsbtodb(&sblock, cgimin(&sblock, cylno) + i), sblock.fs_bsize, &iobuf[start], fsopts); } } } /* * read a block from the file system */ void ffs_rdfs(daddr_t bno, int size, void *bf, const fsinfo_t *fsopts) { int n; off_t offset; offset = bno; offset *= fsopts->sectorsize; if (lseek(fsopts->fd, offset, SEEK_SET) < 0) err(1, "ffs_rdfs: seek error for sector %lld: %s\n", (long long)bno, strerror(errno)); n = read(fsopts->fd, bf, size); if (n == -1) { abort(); err(1, "ffs_rdfs: read error bno %lld size %d", (long long)bno, size); } else if (n != size) errx(1, "ffs_rdfs: read error for sector %lld: %s\n", (long long)bno, strerror(errno)); } /* * write a block to the file system */ void ffs_wtfs(daddr_t bno, int size, void *bf, const fsinfo_t *fsopts) { int n; off_t offset; offset = bno; offset *= fsopts->sectorsize; if (lseek(fsopts->fd, offset, SEEK_SET) < 0) err(1, "wtfs: seek error for sector %lld: %s\n", (long long)bno, strerror(errno)); n = write(fsopts->fd, bf, size); if (n == -1) err(1, "wtfs: write error for sector %lld: %s\n", (long long)bno, strerror(errno)); else if (n != size) errx(1, "wtfs: write error for sector %lld: %s\n", (long long)bno, strerror(errno)); } /* Determine how many digits are needed to print a given integer */ static int count_digits(int num) { int ndig; for(ndig = 1; num > 9; num /=10, ndig++); return (ndig); } static int ilog2(int val) { u_int n; for (n = 0; n < sizeof(n) * CHAR_BIT; n++) if (1 << n == val) return (n); errx(1, "ilog2: %d is not a power of 2\n", val); } Index: head/usr.sbin/makefs/ffs/ufs_bmap.c =================================================================== --- head/usr.sbin/makefs/ffs/ufs_bmap.c (revision 313779) +++ head/usr.sbin/makefs/ffs/ufs_bmap.c (revision 313780) @@ -1,140 +1,140 @@ /* $NetBSD: ufs_bmap.c,v 1.14 2004/06/20 22:20:18 jmc Exp $ */ /* From: NetBSD: ufs_bmap.c,v 1.14 2001/11/08 05:00:51 chs Exp */ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.8 (Berkeley) 8/11/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "makefs.h" #include #include #include "ffs/ufs_bswap.h" #include "ffs/ufs_inode.h" #include "ffs/ffs_extern.h" /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ffs_ib and * once with the offset into the page itself. */ int ufs_getlbns(struct inode *ip, daddr_t bn, struct indir *ap, int *nump) { daddr_t metalbn, realbn; int64_t blockcnt; int lbc; int i, numlevels, off; u_long lognindir; lognindir = ffs(NINDIR(ip->i_fs)) - 1; if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; - assert (bn >= NDADDR); + assert (bn >= UFS_NDADDR); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible - * at the given level of indirection, and NIADDR - i is the number + * at the given level of indirection, and UFS_NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ - bn -= NDADDR; - for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) { + bn -= UFS_NDADDR; + for (lbc = 0, i = UFS_NIADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); lbc += lognindir; blockcnt = (int64_t)1 << lbc; if (bn < blockcnt) break; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) - metalbn = -(realbn - bn + NIADDR - i); + metalbn = -(realbn - bn + UFS_NIADDR - i); else - metalbn = -(-realbn - bn + NIADDR - i); + metalbn = -(-realbn - bn + UFS_NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; - ap->in_off = off = NIADDR - i; + ap->in_off = off = UFS_NIADDR - i; ap++; - for (++numlevels; i <= NIADDR; i++) { + for (++numlevels; i <= UFS_NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; lbc -= lognindir; blockcnt = (int64_t)1 << lbc; off = (bn >> lbc) & (NINDIR(ip->i_fs) - 1); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ++ap; metalbn -= -1 + (off << lbc); } if (nump) *nump = numlevels; return (0); } Index: head/usr.sbin/makefs/ffs.c =================================================================== --- head/usr.sbin/makefs/ffs.c (revision 313779) +++ head/usr.sbin/makefs/ffs.c (revision 313780) @@ -1,1181 +1,1181 @@ /* $NetBSD: ffs.c,v 1.45 2011/10/09 22:49:26 christos Exp $ */ /* * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. 
* * Written by Luke Mewburn for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "makefs.h" #include "ffs.h" #if HAVE_STRUCT_STATVFS_F_IOSIZE && HAVE_FSTATVFS #include #endif #include #include #include #include "ffs/ufs_bswap.h" #include "ffs/ufs_inode.h" #include "ffs/newfs_extern.h" #include "ffs/ffs_extern.h" #undef DIP #define DIP(dp, field) \ ((ffs_opts->version == 1) ? \ (dp)->ffs1_din.di_##field : (dp)->ffs2_din.di_##field) /* * Various file system defaults (cribbed from newfs(8)). */ #define DFL_FRAGSIZE 1024 /* fragment size */ #define DFL_BLKSIZE 8192 /* block size */ #define DFL_SECSIZE 512 /* sector size */ #define DFL_CYLSPERGROUP 65536 /* cylinders per group */ #define DFL_FRAGSPERINODE 4 /* fragments per inode */ #define DFL_ROTDELAY 0 /* rotational delay */ #define DFL_NRPOS 1 /* rotational positions */ #define DFL_RPM 3600 /* rpm of disk */ #define DFL_NSECTORS 64 /* # of sectors */ #define DFL_NTRACKS 16 /* # of tracks */ typedef struct { u_char *buf; /* buf for directory */ doff_t size; /* full size of buf */ doff_t cur; /* offset of current entry */ } dirbuf_t; static int ffs_create_image(const char *, fsinfo_t *); static void ffs_dump_fsinfo(fsinfo_t *); static void ffs_dump_dirbuf(dirbuf_t *, const char *, int); static void ffs_make_dirbuf(dirbuf_t *, const char *, fsnode *, int); static int ffs_populate_dir(const char *, fsnode *, fsinfo_t *); static void ffs_size_dir(fsnode *, fsinfo_t *); static void ffs_validate(const char *, fsnode *, fsinfo_t *); static void ffs_write_file(union dinode *, uint32_t, void *, fsinfo_t *); static void ffs_write_inode(union dinode *, uint32_t, const fsinfo_t *); static void *ffs_build_dinode1(struct ufs1_dinode *, dirbuf_t *, fsnode *, fsnode *, fsinfo_t *); static void *ffs_build_dinode2(struct ufs2_dinode *, dirbuf_t *, fsnode *, fsnode *, fsinfo_t *); int sectorsize; /* XXX: for buf.c::getblk() */ /* publicly visible functions */ void ffs_prep_opts(fsinfo_t *fsopts) { ffs_opt_t *ffs_opts; if ((ffs_opts = calloc(1, sizeof(ffs_opt_t))) == NULL) err(1, "Allocating memory for ffs_options"); fsopts->fs_specific = ffs_opts; ffs_opts->bsize= -1; ffs_opts->fsize= -1; ffs_opts->cpg= -1; ffs_opts->density= -1; ffs_opts->minfree= -1; ffs_opts->optimization= -1; ffs_opts->maxcontig= -1; ffs_opts->maxbpg= -1; ffs_opts->avgfilesize= -1; ffs_opts->avgfpdir= -1; ffs_opts->version = 1; } void ffs_cleanup_opts(fsinfo_t *fsopts) { if (fsopts->fs_specific) free(fsopts->fs_specific); } int ffs_parse_opts(const char *option, fsinfo_t *fsopts) { ffs_opt_t *ffs_opts = fsopts->fs_specific; option_t ffs_options[] = { { "bsize", &ffs_opts->bsize, 1, INT_MAX, "block size" }, { "fsize", &ffs_opts->fsize, 1, INT_MAX, "fragment size" }, { "density", &ffs_opts->density, 1, INT_MAX, "bytes per inode" }, { "minfree", &ffs_opts->minfree, 0, 99, "minfree" }, { "maxbpg", &ffs_opts->maxbpg, 1, INT_MAX, "max blocks per file in a cg" }, { "avgfilesize", &ffs_opts->avgfilesize,1, INT_MAX, "expected average file size" }, { "avgfpdir", &ffs_opts->avgfpdir, 1, INT_MAX, "expected # of files per directory" }, { "extent", &ffs_opts->maxbsize, 1, INT_MAX, "maximum # extent size" }, { "maxbpcg", &ffs_opts->maxblkspercg,1, INT_MAX, "max # of blocks per group" }, { "version", &ffs_opts->version, 1, 2, "UFS version" }, { .name = NULL } }; char *var, *val; int rv; assert(option != NULL); assert(fsopts != NULL); assert(ffs_opts != NULL); if (debug & DEBUG_FS_PARSE_OPTS) 
printf("ffs_parse_opts: got `%s'\n", option); if ((var = strdup(option)) == NULL) err(1, "Allocating memory for copy of option string"); rv = 0; if ((val = strchr(var, '=')) == NULL) { warnx("Option `%s' doesn't contain a value", var); goto leave_ffs_parse_opts; } *val++ = '\0'; if (strcmp(var, "optimization") == 0) { if (strcmp(val, "time") == 0) { ffs_opts->optimization = FS_OPTTIME; } else if (strcmp(val, "space") == 0) { ffs_opts->optimization = FS_OPTSPACE; } else { warnx("Invalid optimization `%s'", val); goto leave_ffs_parse_opts; } rv = 1; } else if (strcmp(var, "label") == 0) { strlcpy(ffs_opts->label, val, sizeof(ffs_opts->label)); rv = 1; } else rv = set_option(ffs_options, var, val); leave_ffs_parse_opts: if (var) free(var); return (rv); } void ffs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) { struct fs *superblock; struct timeval start; assert(image != NULL); assert(dir != NULL); assert(root != NULL); assert(fsopts != NULL); if (debug & DEBUG_FS_MAKEFS) printf("ffs_makefs: image %s directory %s root %p\n", image, dir, root); /* validate tree and options */ TIMER_START(start); ffs_validate(dir, root, fsopts); TIMER_RESULTS(start, "ffs_validate"); printf("Calculated size of `%s': %lld bytes, %lld inodes\n", image, (long long)fsopts->size, (long long)fsopts->inodes); /* create image */ TIMER_START(start); if (ffs_create_image(image, fsopts) == -1) errx(1, "Image file `%s' not created.", image); TIMER_RESULTS(start, "ffs_create_image"); - fsopts->curinode = ROOTINO; + fsopts->curinode = UFS_ROOTINO; if (debug & DEBUG_FS_MAKEFS) putchar('\n'); /* populate image */ printf("Populating `%s'\n", image); TIMER_START(start); if (! ffs_populate_dir(dir, root, fsopts)) errx(1, "Image file `%s' not populated.", image); TIMER_RESULTS(start, "ffs_populate_dir"); /* ensure no outstanding buffers remain */ if (debug & DEBUG_FS_MAKEFS) bcleanup(); /* update various superblock parameters */ superblock = fsopts->superblock; superblock->fs_fmod = 0; superblock->fs_old_cstotal.cs_ndir = superblock->fs_cstotal.cs_ndir; superblock->fs_old_cstotal.cs_nbfree = superblock->fs_cstotal.cs_nbfree; superblock->fs_old_cstotal.cs_nifree = superblock->fs_cstotal.cs_nifree; superblock->fs_old_cstotal.cs_nffree = superblock->fs_cstotal.cs_nffree; /* write out superblock; image is now complete */ ffs_write_superblock(fsopts->superblock, fsopts); if (close(fsopts->fd) == -1) err(1, "Closing `%s'", image); fsopts->fd = -1; printf("Image `%s' complete\n", image); } /* end of public functions */ static void ffs_validate(const char *dir, fsnode *root, fsinfo_t *fsopts) { int32_t ncg = 1; #if notyet int32_t spc, nspf, ncyl, fssize; #endif ffs_opt_t *ffs_opts = fsopts->fs_specific; assert(dir != NULL); assert(root != NULL); assert(fsopts != NULL); assert(ffs_opts != NULL); if (debug & DEBUG_FS_VALIDATE) { printf("ffs_validate: before defaults set:\n"); ffs_dump_fsinfo(fsopts); } /* set FFS defaults */ if (fsopts->sectorsize == -1) fsopts->sectorsize = DFL_SECSIZE; if (ffs_opts->fsize == -1) ffs_opts->fsize = MAX(DFL_FRAGSIZE, fsopts->sectorsize); if (ffs_opts->bsize == -1) ffs_opts->bsize = MIN(DFL_BLKSIZE, 8 * ffs_opts->fsize); if (ffs_opts->cpg == -1) ffs_opts->cpg = DFL_CYLSPERGROUP; else ffs_opts->cpgflg = 1; /* fsopts->density is set below */ if (ffs_opts->nsectors == -1) ffs_opts->nsectors = DFL_NSECTORS; if (ffs_opts->minfree == -1) ffs_opts->minfree = MINFREE; if (ffs_opts->optimization == -1) ffs_opts->optimization = DEFAULTOPT; if (ffs_opts->maxcontig == -1) ffs_opts->maxcontig = 
MAX(1, MIN(MAXPHYS, FFS_MAXBSIZE) / ffs_opts->bsize); /* XXX ondisk32 */ if (ffs_opts->maxbpg == -1) ffs_opts->maxbpg = ffs_opts->bsize / sizeof(int32_t); if (ffs_opts->avgfilesize == -1) ffs_opts->avgfilesize = AVFILESIZ; if (ffs_opts->avgfpdir == -1) ffs_opts->avgfpdir = AFPDIR; if (fsopts->maxsize > 0 && roundup(fsopts->minsize, ffs_opts->bsize) > fsopts->maxsize) errx(1, "`%s' minsize of %lld rounded up to ffs bsize of %d " "exceeds maxsize %lld. Lower bsize, or round the minimum " "and maximum sizes to bsize.", dir, (long long)fsopts->minsize, ffs_opts->bsize, (long long)fsopts->maxsize); /* calculate size of tree */ ffs_size_dir(root, fsopts); - fsopts->inodes += ROOTINO; /* include first two inodes */ + fsopts->inodes += UFS_ROOTINO; /* include first two inodes */ if (debug & DEBUG_FS_VALIDATE) printf("ffs_validate: size of tree: %lld bytes, %lld inodes\n", (long long)fsopts->size, (long long)fsopts->inodes); /* add requested slop */ fsopts->size += fsopts->freeblocks; fsopts->inodes += fsopts->freefiles; if (fsopts->freefilepc > 0) fsopts->inodes = fsopts->inodes * (100 + fsopts->freefilepc) / 100; if (fsopts->freeblockpc > 0) fsopts->size = fsopts->size * (100 + fsopts->freeblockpc) / 100; /* add space needed for superblocks */ /* * The old SBOFF (SBLOCK_UFS1) is used here because makefs is * typically used for small filesystems where space matters. * XXX make this an option. */ fsopts->size += (SBLOCK_UFS1 + SBLOCKSIZE) * ncg; /* add space needed to store inodes, x3 for blockmaps, etc */ if (ffs_opts->version == 1) fsopts->size += ncg * DINODE1_SIZE * roundup(fsopts->inodes / ncg, ffs_opts->bsize / DINODE1_SIZE); else fsopts->size += ncg * DINODE2_SIZE * roundup(fsopts->inodes / ncg, ffs_opts->bsize / DINODE2_SIZE); /* add minfree */ if (ffs_opts->minfree > 0) fsopts->size = fsopts->size * (100 + ffs_opts->minfree) / 100; /* * XXX any other fs slop to add, such as csum's, bitmaps, etc ?? 
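 *
 * A worked instance of the inode-space term above (assumed defaults,
 * illustration only): with bsize = 8192 and UFS2's DINODE2_SIZE = 256
 * there are 32 on-disk inodes per block, so an image that needs just
 * 10 inodes still reserves roundup(10, 32) = 32 of them, i.e. one full
 * 8 KB inode block per cylinder group, before the minfree percentage
 * is applied on top.
 *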
*/ if (fsopts->size < fsopts->minsize) /* ensure meets minimum size */ fsopts->size = fsopts->minsize; /* round up to the next block */ fsopts->size = roundup(fsopts->size, ffs_opts->bsize); /* round up to requested block size, if any */ if (fsopts->roundup > 0) fsopts->size = roundup(fsopts->size, fsopts->roundup); /* calculate density if necessary */ if (ffs_opts->density == -1) ffs_opts->density = fsopts->size / fsopts->inodes + 1; if (debug & DEBUG_FS_VALIDATE) { printf("ffs_validate: after defaults set:\n"); ffs_dump_fsinfo(fsopts); printf("ffs_validate: dir %s; %lld bytes, %lld inodes\n", dir, (long long)fsopts->size, (long long)fsopts->inodes); } sectorsize = fsopts->sectorsize; /* XXX - see earlier */ /* now check calculated sizes vs requested sizes */ if (fsopts->maxsize > 0 && fsopts->size > fsopts->maxsize) { errx(1, "`%s' size of %lld is larger than the maxsize of %lld.", dir, (long long)fsopts->size, (long long)fsopts->maxsize); } } static void ffs_dump_fsinfo(fsinfo_t *f) { ffs_opt_t *fs = f->fs_specific; printf("fsopts at %p\n", f); printf("\tsize %lld, inodes %lld, curinode %u\n", (long long)f->size, (long long)f->inodes, f->curinode); printf("\tminsize %lld, maxsize %lld\n", (long long)f->minsize, (long long)f->maxsize); printf("\tfree files %lld, freefile %% %d\n", (long long)f->freefiles, f->freefilepc); printf("\tfree blocks %lld, freeblock %% %d\n", (long long)f->freeblocks, f->freeblockpc); printf("\tneedswap %d, sectorsize %d\n", f->needswap, f->sectorsize); printf("\tbsize %d, fsize %d, cpg %d, density %d\n", fs->bsize, fs->fsize, fs->cpg, fs->density); printf("\tnsectors %d, rpm %d, minfree %d\n", fs->nsectors, fs->rpm, fs->minfree); printf("\tmaxcontig %d, maxbpg %d\n", fs->maxcontig, fs->maxbpg); printf("\toptimization %s\n", fs->optimization == FS_OPTSPACE ? "space" : "time"); } static int ffs_create_image(const char *image, fsinfo_t *fsopts) { #if HAVE_STRUCT_STATVFS_F_IOSIZE && HAVE_FSTATVFS struct statvfs sfs; #endif struct fs *fs; char *buf; int i, bufsize; off_t bufrem; time_t tstamp; assert (image != NULL); assert (fsopts != NULL); /* create image */ if ((fsopts->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0666)) == -1) { warn("Can't open `%s' for writing", image); return (-1); } /* zero image */ #if HAVE_STRUCT_STATVFS_F_IOSIZE && HAVE_FSTATVFS if (fstatvfs(fsopts->fd, &sfs) == -1) { #endif bufsize = 8192; #if HAVE_STRUCT_STATVFS_F_IOSIZE && HAVE_FSTATVFS warn("can't fstatvfs `%s', using default %d byte chunk", image, bufsize); } else bufsize = sfs.f_iosize; #endif bufrem = fsopts->size; if (fsopts->sparse) { if (ftruncate(fsopts->fd, bufrem) == -1) { warn("sparse option disabled.\n"); fsopts->sparse = 0; } } if (fsopts->sparse) { /* File truncated at bufrem. 
Remaining is 0 */ bufrem = 0; buf = NULL; } else { if (debug & DEBUG_FS_CREATE_IMAGE) printf("zero-ing image `%s', %lld sectors, " "using %d byte chunks\n", image, (long long)bufrem, bufsize); if ((buf = calloc(1, bufsize)) == NULL) { warn("Can't create buffer for sector"); return (-1); } } while (bufrem > 0) { i = write(fsopts->fd, buf, MIN(bufsize, bufrem)); if (i == -1) { warn("zeroing image, %lld bytes to go", (long long)bufrem); free(buf); return (-1); } bufrem -= i; } if (buf) free(buf); /* make the file system */ if (debug & DEBUG_FS_CREATE_IMAGE) printf("calling mkfs(\"%s\", ...)\n", image); if (stampst.st_ino != 0) tstamp = stampst.st_ctime; else tstamp = start_time.tv_sec; srandom(tstamp); fs = ffs_mkfs(image, fsopts, tstamp); fsopts->superblock = (void *)fs; if (debug & DEBUG_FS_CREATE_IMAGE) { time_t t; t = (time_t)((struct fs *)fsopts->superblock)->fs_time; printf("mkfs returned %p; fs_time %s", fsopts->superblock, ctime(&t)); printf("fs totals: nbfree %lld, nffree %lld, nifree %lld, ndir %lld\n", (long long)fs->fs_cstotal.cs_nbfree, (long long)fs->fs_cstotal.cs_nffree, (long long)fs->fs_cstotal.cs_nifree, (long long)fs->fs_cstotal.cs_ndir); } - if (fs->fs_cstotal.cs_nifree + ROOTINO < fsopts->inodes) { + if (fs->fs_cstotal.cs_nifree + UFS_ROOTINO < fsopts->inodes) { warnx( "Image file `%s' has %lld free inodes; %lld are required.", image, - (long long)(fs->fs_cstotal.cs_nifree + ROOTINO), + (long long)(fs->fs_cstotal.cs_nifree + UFS_ROOTINO), (long long)fsopts->inodes); return (-1); } return (fsopts->fd); } static void ffs_size_dir(fsnode *root, fsinfo_t *fsopts) { struct direct tmpdir; fsnode * node; int curdirsize, this; ffs_opt_t *ffs_opts = fsopts->fs_specific; /* node may be NULL (empty directory) */ assert(fsopts != NULL); assert(ffs_opts != NULL); if (debug & DEBUG_FS_SIZE_DIR) printf("ffs_size_dir: entry: bytes %lld inodes %lld\n", (long long)fsopts->size, (long long)fsopts->inodes); #define ADDDIRENT(e) do { \ tmpdir.d_namlen = strlen((e)); \ this = DIRSIZ_SWAP(0, &tmpdir, 0); \ if (debug & DEBUG_FS_SIZE_DIR_ADD_DIRENT) \ printf("ADDDIRENT: was: %s (%d) this %d cur %d\n", \ e, tmpdir.d_namlen, this, curdirsize); \ if (this + curdirsize > roundup(curdirsize, DIRBLKSIZ)) \ curdirsize = roundup(curdirsize, DIRBLKSIZ); \ curdirsize += this; \ if (debug & DEBUG_FS_SIZE_DIR_ADD_DIRENT) \ printf("ADDDIRENT: now: %s (%d) this %d cur %d\n", \ e, tmpdir.d_namlen, this, curdirsize); \ } while (0); /* * XXX this needs to take into account extra space consumed * by indirect blocks, etc. */ #define ADDSIZE(x) do { \ fsopts->size += roundup((x), ffs_opts->fsize); \ } while (0); curdirsize = 0; for (node = root; node != NULL; node = node->next) { ADDDIRENT(node->name); if (node == root) { /* we're at "." */ assert(strcmp(node->name, ".") == 0); ADDDIRENT(".."); } else if ((node->inode->flags & FI_SIZED) == 0) { /* don't count duplicate names */ node->inode->flags |= FI_SIZED; if (debug & DEBUG_FS_SIZE_DIR_NODE) printf("ffs_size_dir: `%s' size %lld\n", node->name, (long long)node->inode->st.st_size); fsopts->inodes++; if (node->type == S_IFREG) ADDSIZE(node->inode->st.st_size); if (node->type == S_IFLNK) { int slen; slen = strlen(node->symlink) + 1; if (slen >= (ffs_opts->version == 1 ? 
- MAXSYMLINKLEN_UFS1 : - MAXSYMLINKLEN_UFS2)) + UFS1_MAXSYMLINKLEN : + UFS2_MAXSYMLINKLEN)) ADDSIZE(slen); } } if (node->type == S_IFDIR) ffs_size_dir(node->child, fsopts); } ADDSIZE(curdirsize); if (debug & DEBUG_FS_SIZE_DIR) printf("ffs_size_dir: exit: size %lld inodes %lld\n", (long long)fsopts->size, (long long)fsopts->inodes); } static void * ffs_build_dinode1(struct ufs1_dinode *dinp, dirbuf_t *dbufp, fsnode *cur, fsnode *root, fsinfo_t *fsopts) { int slen; void *membuf; struct stat *st = stampst.st_ino != 0 ? &stampst : &cur->inode->st; memset(dinp, 0, sizeof(*dinp)); dinp->di_mode = cur->inode->st.st_mode; dinp->di_nlink = cur->inode->nlink; dinp->di_size = cur->inode->st.st_size; #if HAVE_STRUCT_STAT_ST_FLAGS dinp->di_flags = cur->inode->st.st_flags; #endif dinp->di_gen = random(); dinp->di_uid = cur->inode->st.st_uid; dinp->di_gid = cur->inode->st.st_gid; dinp->di_atime = st->st_atime; dinp->di_mtime = st->st_mtime; dinp->di_ctime = st->st_ctime; #if HAVE_STRUCT_STAT_ST_MTIMENSEC dinp->di_atimensec = st->st_atimensec; dinp->di_mtimensec = st->st_mtimensec; dinp->di_ctimensec = st->st_ctimensec; #endif /* not set: di_db, di_ib, di_blocks, di_spare */ membuf = NULL; if (cur == root) { /* "."; write dirbuf */ membuf = dbufp->buf; dinp->di_size = dbufp->size; } else if (S_ISBLK(cur->type) || S_ISCHR(cur->type)) { dinp->di_size = 0; /* a device */ dinp->di_rdev = ufs_rw32(cur->inode->st.st_rdev, fsopts->needswap); } else if (S_ISLNK(cur->type)) { /* symlink */ slen = strlen(cur->symlink); - if (slen < MAXSYMLINKLEN_UFS1) { /* short link */ + if (slen < UFS1_MAXSYMLINKLEN) { /* short link */ memcpy(dinp->di_db, cur->symlink, slen); } else membuf = cur->symlink; dinp->di_size = slen; } return membuf; } static void * ffs_build_dinode2(struct ufs2_dinode *dinp, dirbuf_t *dbufp, fsnode *cur, fsnode *root, fsinfo_t *fsopts) { int slen; void *membuf; struct stat *st = stampst.st_ino != 0 ? 
&stampst : &cur->inode->st; memset(dinp, 0, sizeof(*dinp)); dinp->di_mode = cur->inode->st.st_mode; dinp->di_nlink = cur->inode->nlink; dinp->di_size = cur->inode->st.st_size; #if HAVE_STRUCT_STAT_ST_FLAGS dinp->di_flags = cur->inode->st.st_flags; #endif dinp->di_gen = random(); dinp->di_uid = cur->inode->st.st_uid; dinp->di_gid = cur->inode->st.st_gid; dinp->di_atime = st->st_atime; dinp->di_mtime = st->st_mtime; dinp->di_ctime = st->st_ctime; #if HAVE_STRUCT_STAT_ST_MTIMENSEC dinp->di_atimensec = st->st_atimensec; dinp->di_mtimensec = st->st_mtimensec; dinp->di_ctimensec = st->st_ctimensec; #endif #if HAVE_STRUCT_STAT_BIRTHTIME dinp->di_birthtime = st->st_birthtime; dinp->di_birthnsec = st->st_birthtimensec; #endif /* not set: di_db, di_ib, di_blocks, di_spare */ membuf = NULL; if (cur == root) { /* "."; write dirbuf */ membuf = dbufp->buf; dinp->di_size = dbufp->size; } else if (S_ISBLK(cur->type) || S_ISCHR(cur->type)) { dinp->di_size = 0; /* a device */ dinp->di_rdev = ufs_rw64(cur->inode->st.st_rdev, fsopts->needswap); } else if (S_ISLNK(cur->type)) { /* symlink */ slen = strlen(cur->symlink); - if (slen < MAXSYMLINKLEN_UFS2) { /* short link */ + if (slen < UFS2_MAXSYMLINKLEN) { /* short link */ memcpy(dinp->di_db, cur->symlink, slen); } else membuf = cur->symlink; dinp->di_size = slen; } return membuf; } static int ffs_populate_dir(const char *dir, fsnode *root, fsinfo_t *fsopts) { fsnode *cur; dirbuf_t dirbuf; union dinode din; void *membuf; char path[MAXPATHLEN + 1]; ffs_opt_t *ffs_opts = fsopts->fs_specific; assert(dir != NULL); assert(root != NULL); assert(fsopts != NULL); assert(ffs_opts != NULL); (void)memset(&dirbuf, 0, sizeof(dirbuf)); if (debug & DEBUG_FS_POPULATE) printf("ffs_populate_dir: PASS 1 dir %s node %p\n", dir, root); /* * pass 1: allocate inode numbers, build directory `file' */ for (cur = root; cur != NULL; cur = cur->next) { if ((cur->inode->flags & FI_ALLOCATED) == 0) { cur->inode->flags |= FI_ALLOCATED; if (cur == root && cur->parent != NULL) cur->inode->ino = cur->parent->inode->ino; else { cur->inode->ino = fsopts->curinode; fsopts->curinode++; } } ffs_make_dirbuf(&dirbuf, cur->name, cur, fsopts->needswap); if (cur == root) { /* we're at "."; add ".." */ ffs_make_dirbuf(&dirbuf, "..", cur->parent == NULL ? cur : cur->parent->first, fsopts->needswap); root->inode->nlink++; /* count my parent's link */ } else if (cur->child != NULL) root->inode->nlink++; /* count my child's link */ /* * XXX possibly write file and long symlinks here, * ensuring that blocks get written before inodes? * otoh, this isn't a real filesystem, so who * cares about ordering? 
:-) */ } if (debug & DEBUG_FS_POPULATE_DIRBUF) ffs_dump_dirbuf(&dirbuf, dir, fsopts->needswap); /* * pass 2: write out dirbuf, then non-directories at this level */ if (debug & DEBUG_FS_POPULATE) printf("ffs_populate_dir: PASS 2 dir %s\n", dir); for (cur = root; cur != NULL; cur = cur->next) { if (cur->inode->flags & FI_WRITTEN) continue; /* skip hard-linked entries */ cur->inode->flags |= FI_WRITTEN; if (cur->contents == NULL) { if (snprintf(path, sizeof(path), "%s/%s/%s", cur->root, cur->path, cur->name) >= (int)sizeof(path)) errx(1, "Pathname too long."); } if (cur->child != NULL) continue; /* child creates own inode */ /* build on-disk inode */ if (ffs_opts->version == 1) membuf = ffs_build_dinode1(&din.ffs1_din, &dirbuf, cur, root, fsopts); else membuf = ffs_build_dinode2(&din.ffs2_din, &dirbuf, cur, root, fsopts); if (debug & DEBUG_FS_POPULATE_NODE) { printf("ffs_populate_dir: writing ino %d, %s", cur->inode->ino, inode_type(cur->type)); if (cur->inode->nlink > 1) printf(", nlink %d", cur->inode->nlink); putchar('\n'); } if (membuf != NULL) { ffs_write_file(&din, cur->inode->ino, membuf, fsopts); } else if (S_ISREG(cur->type)) { ffs_write_file(&din, cur->inode->ino, (cur->contents) ? cur->contents : path, fsopts); } else { assert (! S_ISDIR(cur->type)); ffs_write_inode(&din, cur->inode->ino, fsopts); } } /* * pass 3: write out sub-directories */ if (debug & DEBUG_FS_POPULATE) printf("ffs_populate_dir: PASS 3 dir %s\n", dir); for (cur = root; cur != NULL; cur = cur->next) { if (cur->child == NULL) continue; if (snprintf(path, sizeof(path), "%s/%s", dir, cur->name) >= sizeof(path)) errx(1, "Pathname too long."); if (! ffs_populate_dir(path, cur->child, fsopts)) return (0); } if (debug & DEBUG_FS_POPULATE) printf("ffs_populate_dir: DONE dir %s\n", dir); /* cleanup */ if (dirbuf.buf != NULL) free(dirbuf.buf); return (1); } static void ffs_write_file(union dinode *din, uint32_t ino, void *buf, fsinfo_t *fsopts) { int isfile, ffd; char *fbuf, *p; off_t bufleft, chunk, offset; ssize_t nread; struct inode in; struct buf * bp; ffs_opt_t *ffs_opts = fsopts->fs_specific; assert (din != NULL); assert (buf != NULL); assert (fsopts != NULL); assert (ffs_opts != NULL); isfile = S_ISREG(DIP(din, mode)); fbuf = NULL; ffd = -1; p = NULL; in.i_fs = (struct fs *)fsopts->superblock; if (debug & DEBUG_FS_WRITE_FILE) { printf( "ffs_write_file: ino %u, din %p, isfile %d, %s, size %lld", ino, din, isfile, inode_type(DIP(din, mode) & S_IFMT), (long long)DIP(din, size)); if (isfile) printf(", file '%s'\n", (char *)buf); else printf(", buffer %p\n", buf); } in.i_number = ino; in.i_size = DIP(din, size); if (ffs_opts->version == 1) memcpy(&in.i_din.ffs1_din, &din->ffs1_din, sizeof(in.i_din.ffs1_din)); else memcpy(&in.i_din.ffs2_din, &din->ffs2_din, sizeof(in.i_din.ffs2_din)); in.i_fd = fsopts->fd; if (DIP(din, size) == 0) goto write_inode_and_leave; /* mmm, cheating */ if (isfile) { if ((fbuf = malloc(ffs_opts->bsize)) == NULL) err(1, "Allocating memory for write buffer"); if ((ffd = open((char *)buf, O_RDONLY, 0444)) == -1) { warn("Can't open `%s' for reading", (char *)buf); goto leave_ffs_write_file; } } else { p = buf; } chunk = 0; for (bufleft = DIP(din, size); bufleft > 0; bufleft -= chunk) { chunk = MIN(bufleft, ffs_opts->bsize); if (!isfile) ; else if ((nread = read(ffd, fbuf, chunk)) == -1) err(EXIT_FAILURE, "Reading `%s', %lld bytes to go", (char *)buf, (long long)bufleft); else if (nread != chunk) errx(EXIT_FAILURE, "Reading `%s', %lld bytes to go, " "read %zd bytes, expected %ju bytes, does " "metalog 
size= attribute mismatch source size?", (char *)buf, (long long)bufleft, nread, (uintmax_t)chunk); else p = fbuf; offset = DIP(din, size) - bufleft; if (debug & DEBUG_FS_WRITE_FILE_BLOCK) printf( "ffs_write_file: write %p offset %lld size %lld left %lld\n", p, (long long)offset, (long long)chunk, (long long)bufleft); /* * XXX if holey support is desired, do the check here * * XXX might need to write out last bit in fragroundup * sized chunk. however, ffs_balloc() handles this for us */ errno = ffs_balloc(&in, offset, chunk, &bp); bad_ffs_write_file: if (errno != 0) err(1, "Writing inode %d (%s), bytes %lld + %lld", ino, isfile ? (char *)buf : inode_type(DIP(din, mode) & S_IFMT), (long long)offset, (long long)chunk); memcpy(bp->b_data, p, chunk); errno = bwrite(bp); if (errno != 0) goto bad_ffs_write_file; brelse(bp, 0); if (!isfile) p += chunk; } write_inode_and_leave: ffs_write_inode(&in.i_din, in.i_number, fsopts); leave_ffs_write_file: if (fbuf) free(fbuf); if (ffd != -1) close(ffd); } static void ffs_dump_dirbuf(dirbuf_t *dbuf, const char *dir, int needswap) { doff_t i; struct direct *de; uint16_t reclen; assert (dbuf != NULL); assert (dir != NULL); printf("ffs_dump_dirbuf: dir %s size %d cur %d\n", dir, dbuf->size, dbuf->cur); for (i = 0; i < dbuf->size; ) { de = (struct direct *)(dbuf->buf + i); reclen = ufs_rw16(de->d_reclen, needswap); printf( " inode %4d %7s offset %4d reclen %3d namlen %3d name %s\n", ufs_rw32(de->d_ino, needswap), inode_type(DTTOIF(de->d_type)), i, reclen, de->d_namlen, de->d_name); i += reclen; assert(reclen > 0); } } static void ffs_make_dirbuf(dirbuf_t *dbuf, const char *name, fsnode *node, int needswap) { struct direct de, *dp; uint16_t llen, reclen; u_char *newbuf; assert (dbuf != NULL); assert (name != NULL); assert (node != NULL); /* create direct entry */ (void)memset(&de, 0, sizeof(de)); de.d_ino = ufs_rw32(node->inode->ino, needswap); de.d_type = IFTODT(node->type); de.d_namlen = (uint8_t)strlen(name); strcpy(de.d_name, name); reclen = DIRSIZ_SWAP(0, &de, needswap); de.d_reclen = ufs_rw16(reclen, needswap); dp = (struct direct *)(dbuf->buf + dbuf->cur); llen = 0; if (dp != NULL) llen = DIRSIZ_SWAP(0, dp, needswap); if (debug & DEBUG_FS_MAKE_DIRBUF) printf( "ffs_make_dirbuf: dbuf siz %d cur %d lastlen %d\n" " ino %d type %d reclen %d namlen %d name %.30s\n", dbuf->size, dbuf->cur, llen, ufs_rw32(de.d_ino, needswap), de.d_type, reclen, de.d_namlen, de.d_name); if (reclen + dbuf->cur + llen > roundup(dbuf->size, DIRBLKSIZ)) { if (debug & DEBUG_FS_MAKE_DIRBUF) printf("ffs_make_dirbuf: growing buf to %d\n", dbuf->size + DIRBLKSIZ); if ((newbuf = realloc(dbuf->buf, dbuf->size + DIRBLKSIZ)) == NULL) err(1, "Allocating memory for directory buffer"); dbuf->buf = newbuf; dbuf->size += DIRBLKSIZ; memset(dbuf->buf + dbuf->size - DIRBLKSIZ, 0, DIRBLKSIZ); dbuf->cur = dbuf->size - DIRBLKSIZ; } else if (dp) { /* shrink end of previous */ dp->d_reclen = ufs_rw16(llen,needswap); dbuf->cur += llen; } dp = (struct direct *)(dbuf->buf + dbuf->cur); memcpy(dp, &de, reclen); dp->d_reclen = ufs_rw16(dbuf->size - dbuf->cur, needswap); } /* * cribbed from sys/ufs/ffs/ffs_alloc.c */ static void ffs_write_inode(union dinode *dp, uint32_t ino, const fsinfo_t *fsopts) { char *buf; struct ufs1_dinode *dp1; struct ufs2_dinode *dp2, *dip; struct cg *cgp; struct fs *fs; int cg, cgino, i; daddr_t d; char sbbuf[FFS_MAXBSIZE]; int32_t initediblk; ffs_opt_t *ffs_opts = fsopts->fs_specific; assert (dp != NULL); assert (ino > 0); assert (fsopts != NULL); assert (ffs_opts != NULL); fs = 
(struct fs *)fsopts->superblock; cg = ino_to_cg(fs, ino); cgino = ino % fs->fs_ipg; if (debug & DEBUG_FS_WRITE_INODE) printf("ffs_write_inode: din %p ino %u cg %d cgino %d\n", dp, ino, cg, cgino); ffs_rdfs(fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, &sbbuf, fsopts); cgp = (struct cg *)sbbuf; if (!cg_chkmagic_swap(cgp, fsopts->needswap)) errx(1, "ffs_write_inode: cg %d: bad magic number", cg); assert (isclr(cg_inosused_swap(cgp, fsopts->needswap), cgino)); buf = malloc(fs->fs_bsize); if (buf == NULL) errx(1, "ffs_write_inode: cg %d: can't alloc inode block", cg); dp1 = (struct ufs1_dinode *)buf; dp2 = (struct ufs2_dinode *)buf; if (fs->fs_cstotal.cs_nifree == 0) errx(1, "ffs_write_inode: fs out of inodes for ino %u", ino); if (fs->fs_cs(fs, cg).cs_nifree == 0) errx(1, "ffs_write_inode: cg %d out of inodes for ino %u", cg, ino); setbit(cg_inosused_swap(cgp, fsopts->needswap), cgino); ufs_add32(cgp->cg_cs.cs_nifree, -1, fsopts->needswap); fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; if (S_ISDIR(DIP(dp, mode))) { ufs_add32(cgp->cg_cs.cs_ndir, 1, fsopts->needswap); fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } /* * Initialize inode blocks on the fly for UFS2. */ initediblk = ufs_rw32(cgp->cg_initediblk, fsopts->needswap); if (ffs_opts->version == 2 && cgino + INOPB(fs) > initediblk && initediblk < ufs_rw32(cgp->cg_niblk, fsopts->needswap)) { memset(buf, 0, fs->fs_bsize); dip = (struct ufs2_dinode *)buf; for (i = 0; i < INOPB(fs); i++) { dip->di_gen = random(); dip++; } ffs_wtfs(fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)), fs->fs_bsize, buf, fsopts); initediblk += INOPB(fs); cgp->cg_initediblk = ufs_rw32(initediblk, fsopts->needswap); } ffs_wtfs(fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, &sbbuf, fsopts); /* now write inode */ d = fsbtodb(fs, ino_to_fsba(fs, ino)); ffs_rdfs(d, fs->fs_bsize, buf, fsopts); if (fsopts->needswap) { if (ffs_opts->version == 1) ffs_dinode1_swap(&dp->ffs1_din, &dp1[ino_to_fsbo(fs, ino)]); else ffs_dinode2_swap(&dp->ffs2_din, &dp2[ino_to_fsbo(fs, ino)]); } else { if (ffs_opts->version == 1) dp1[ino_to_fsbo(fs, ino)] = dp->ffs1_din; else dp2[ino_to_fsbo(fs, ino)] = dp->ffs2_din; } ffs_wtfs(d, fs->fs_bsize, buf, fsopts); free(buf); } void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vwarnx(fmt, ap); va_end(ap); exit(1); } Index: head/usr.sbin/makefs/makefs.h =================================================================== --- head/usr.sbin/makefs/makefs.h (revision 313779) +++ head/usr.sbin/makefs/makefs.h (revision 313780) @@ -1,287 +1,287 @@ /* $NetBSD: makefs.h,v 1.20 2008/12/28 21:51:46 christos Exp $ */ /* * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Luke Mewburn for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. 
Index: head/usr.sbin/makefs/makefs.h
===================================================================
--- head/usr.sbin/makefs/makefs.h	(revision 313779)
+++ head/usr.sbin/makefs/makefs.h	(revision 313780)
@@ -1,287 +1,287 @@
/*	$NetBSD: makefs.h,v 1.20 2008/12/28 21:51:46 christos Exp $	*/

/*
 * Copyright (c) 2001 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Luke Mewburn for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef	_MAKEFS_H
#define	_MAKEFS_H

#include
#include

/*
 * fsnode -
 *	a component of the tree; contains a filename, a pointer to
 *	fsinode, optional symlink name, and tree pointers
 *
 * fsinode -
 *	equivalent to an inode, containing target file system inode number,
 *	refcount (nlink), and stat buffer
 *
 * A tree of fsnodes looks like this:
 *
 *	name	"."		"bin"		"netbsd"
 *	type	S_IFDIR		S_IFDIR		S_IFREG
 *	next	  >		  >		NULL
 *	parent	NULL		NULL		NULL
 *	child	NULL		  v
 *
 *	name	"."		"ls"
 *	type	S_IFDIR		S_IFREG
 *	next	  >		NULL
 *	parent	  ^		  ^ (to "bin")
 *	child	NULL		NULL
 *
 * Notes:
 *	- first always points to first entry, at current level, which
 *	  must be "." when the tree has been built; during build it may
 *	  not be if "." hasn't yet been found by readdir(2).
 */

enum fi_flags {
	FI_SIZED =	1<<0,		/* inode sized */
	FI_ALLOCATED =	1<<1,		/* fsinode->ino allocated */
	FI_WRITTEN =	1<<2,		/* inode written */
};

typedef struct {
	uint32_t	ino;		/* inode number used on target fs */
	uint32_t	nlink;		/* number of links to this entry */
	enum fi_flags	flags;		/* flags used by fs specific code */
	struct stat	st;		/* stat entry */
} fsinode;

typedef struct _fsnode {
	struct _fsnode	*parent;	/* parent (NULL if root) */
	struct _fsnode	*child;		/* child (if type == S_IFDIR) */
	struct _fsnode	*next;		/* next */
	struct _fsnode	*first;		/* first node of current level (".") */
	uint32_t	type;		/* type of entry */
	fsinode		*inode;		/* actual inode data */
	char		*symlink;	/* symlink target */
	char		*contents;	/* file to provide contents */
	const char	*root;		/* root path */
	char		*path;		/* directory name */
	char		*name;		/* file name */
	int		flags;		/* misc flags */
} fsnode;

#define	FSNODE_F_HASSPEC	0x01	/* fsnode has a spec entry */
#define	FSNODE_F_OPTIONAL	0x02	/* fsnode is optional */
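[Editor's note: a minimal sketch of how the links described above are walked
(next for siblings, child for directory contents). struct node here is a
stripped-down stand-in for fsnode, and the tree mirrors the "." / "bin" /
"netbsd" example in the comment; it is not makefs code.]

#include <stdio.h>

struct node {
	const char  *name;
	struct node *next;	/* next sibling at this level */
	struct node *child;	/* first entry (".") if a directory */
};

static void
walk(const struct node *n, int depth)
{
	for (; n != NULL; n = n->next) {
		printf("%*s%s\n", depth * 2, "", n->name);
		walk(n->child, depth + 1);
	}
}

int
main(void)
{
	struct node ls   = { "ls",     NULL,  NULL  };
	struct node dot2 = { ".",      &ls,   NULL  };
	struct node nbsd = { "netbsd", NULL,  NULL  };
	struct node bin  = { "bin",    &nbsd, &dot2 };
	struct node dot  = { ".",      &bin,  NULL  };

	walk(&dot, 0);	/* prints the layout shown in the comment above */
	return 0;
}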
/*
 * fsinfo_t - contains various settings and parameters pertaining to
 * the image, including current settings, global options, and fs
 * specific options
 */
typedef struct {
		/* current settings */
	off_t	size;		/* total size */
	off_t	inodes;		/* number of inodes */
	uint32_t curinode;	/* current inode */

		/* image settings */
	int	fd;		/* file descriptor of image */
	void	*superblock;	/* superblock */
	int	onlyspec;	/* only add entries in specfile */

		/* global options */
	off_t	minsize;	/* minimum size image should be */
	off_t	maxsize;	/* maximum size image can be */
	off_t	freefiles;	/* free file entries to leave */
	int	freefilepc;	/* free file % */
	off_t	freeblocks;	/* free blocks to leave */
	int	freeblockpc;	/* free block % */
	int	needswap;	/* non-zero if byte swapping needed */
	int	sectorsize;	/* sector size */
	int	sparse;		/* sparse image, don't fill it with zeros */
	off_t	roundup;	/* round image size up to this value */

	void	*fs_specific;	/* File system specific additions. */
} fsinfo_t;

/*
 * option_t - contains option name, description, pointer to location to store
 * result, and range checks for the result. Used to simplify fs specific
 * option setting
 */
typedef struct {
	const char	*name;		/* option name */
	int		*value;		/* where to stuff the value */
	int		minimum;	/* minimum for value */
	int		maximum;	/* maximum for value */
	const char	*desc;		/* option description */
} option_t;

void		apply_specfile(const char *, const char *, fsnode *, int);
void		dump_fsnodes(fsnode *);
const char *	inode_type(mode_t);
fsnode *	read_mtree(const char *, fsnode *);
int		set_option(option_t *, const char *, const char *);
fsnode *	walk_dir(const char *, const char *, fsnode *, fsnode *);
void		free_fsnodes(fsnode *);

void		ffs_prep_opts(fsinfo_t *);
int		ffs_parse_opts(const char *, fsinfo_t *);
void		ffs_cleanup_opts(fsinfo_t *);
void		ffs_makefs(const char *, const char *, fsnode *, fsinfo_t *);

void		cd9660_prep_opts(fsinfo_t *);
int		cd9660_parse_opts(const char *, fsinfo_t *);
void		cd9660_cleanup_opts(fsinfo_t *);
void		cd9660_makefs(const char *, const char *, fsnode *, fsinfo_t *);

extern	u_int		debug;
extern	int		dupsok;
extern	struct timespec	start_time;
extern	struct stat	stampst;

/*
 * If -x is specified, we want to exclude nodes which do not appear
 * in the spec file.
 */
#define	FSNODE_EXCLUDE_P(opts, fsnode)	\
	((opts)->onlyspec != 0 && ((fsnode)->flags & FSNODE_F_HASSPEC) == 0)

#define	DEBUG_TIME			0x00000001
		/* debug bits 1..3 unused at this time */
#define	DEBUG_WALK_DIR			0x00000010
#define	DEBUG_WALK_DIR_NODE		0x00000020
#define	DEBUG_WALK_DIR_LINKCHECK	0x00000040
#define	DEBUG_DUMP_FSNODES		0x00000080
#define	DEBUG_DUMP_FSNODES_VERBOSE	0x00000100
#define	DEBUG_FS_PARSE_OPTS		0x00000200
#define	DEBUG_FS_MAKEFS			0x00000400
#define	DEBUG_FS_VALIDATE		0x00000800
#define	DEBUG_FS_CREATE_IMAGE		0x00001000
#define	DEBUG_FS_SIZE_DIR		0x00002000
#define	DEBUG_FS_SIZE_DIR_NODE		0x00004000
#define	DEBUG_FS_SIZE_DIR_ADD_DIRENT	0x00008000
#define	DEBUG_FS_POPULATE		0x00010000
#define	DEBUG_FS_POPULATE_DIRBUF	0x00020000
#define	DEBUG_FS_POPULATE_NODE		0x00040000
#define	DEBUG_FS_WRITE_FILE		0x00080000
#define	DEBUG_FS_WRITE_FILE_BLOCK	0x00100000
#define	DEBUG_FS_MAKE_DIRBUF		0x00200000
#define	DEBUG_FS_WRITE_INODE		0x00400000
#define	DEBUG_BUF_BREAD			0x00800000
#define	DEBUG_BUF_BWRITE		0x01000000
#define	DEBUG_BUF_GETBLK		0x02000000
#define	DEBUG_APPLY_SPECFILE		0x04000000
#define	DEBUG_APPLY_SPECENTRY		0x08000000
#define	DEBUG_APPLY_SPECONLY		0x10000000

#define	TIMER_START(x)				\
	if (debug & DEBUG_TIME)			\
		gettimeofday(&(x), NULL)

#define	TIMER_RESULTS(x,d)				\
	if (debug & DEBUG_TIME) {			\
		struct timeval end, td;			\
		gettimeofday(&end, NULL);		\
		timersub(&end, &(x), &td);		\
		printf("%s took %lld.%06ld seconds\n",	\
		    (d), (long long)td.tv_sec,		\
		    (long)td.tv_usec);			\
	}

#ifndef	DEFAULT_FSTYPE
#define	DEFAULT_FSTYPE	"ffs"
#endif
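[Editor's note: how the TIMER_START/TIMER_RESULTS pair is meant to bracket a
phase. The macros are repeated locally so the sketch compiles on its own; the
debug flag value and the usleep() "work" are placeholders, not makefs usage.]

#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

static unsigned debug = 0x00000001;	/* DEBUG_TIME */

#define TIMER_START(x)				\
	if (debug & 0x00000001)			\
		gettimeofday(&(x), NULL)

#define TIMER_RESULTS(x, d)				\
	if (debug & 0x00000001) {			\
		struct timeval end, td;			\
		gettimeofday(&end, NULL);		\
		timersub(&end, &(x), &td);		\
		printf("%s took %lld.%06ld seconds\n",	\
		    (d), (long long)td.tv_sec,		\
		    (long)td.tv_usec);			\
	}

int
main(void)
{
	struct timeval start;

	TIMER_START(start);
	usleep(250000);			/* the phase being measured */
	TIMER_RESULTS(start, "sleep phase");
	return 0;
}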
/*
 * ffs specific settings
 * ---------------------
 */

#define	FFS_EI		/* for opposite endian support in ffs headers */

/*
 * Write-arounds/compat shims for endian-agnostic support.
 * These belong in the kernel if/when it's possible to mount
 * filesystems w/ either byte order.
 */

/*
 * File system internal flags, also in fs_flags.
 * (Pick highest number to avoid conflicts with others)
 */
#define	FS_SWAPPED	0x80000000	/* file system is endian swapped */
#define	FS_INTERNAL	0x80000000	/* mask for internal flags */

#define	FS_ISCLEAN	1

#define	DINODE1_SIZE	(sizeof(struct ufs1_dinode))
#define	DINODE2_SIZE	(sizeof(struct ufs2_dinode))

-#define	MAXSYMLINKLEN_UFS1	((NDADDR + NIADDR) * sizeof(ufs1_daddr_t))
-#define	MAXSYMLINKLEN_UFS2	((NDADDR + NIADDR) * sizeof(ufs2_daddr_t))
+#define	UFS1_MAXSYMLINKLEN	((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t))
+#define	UFS2_MAXSYMLINKLEN	((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t))

#if (BYTE_ORDER == LITTLE_ENDIAN)
#define	DIRSIZ_SWAP(oldfmt, dp, needswap)	\
    (((oldfmt) && !(needswap)) ?		\
    DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen))
#else
#define	DIRSIZ_SWAP(oldfmt, dp, needswap)	\
    (((oldfmt) && (needswap)) ?			\
    DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen))
#endif

#define	cg_chkmagic_swap(cgp, ns) \
    (ufs_rw32((cgp)->cg_magic, (ns)) == CG_MAGIC)
#define	cg_inosused_swap(cgp, ns) \
    ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_iusedoff, (ns))))
#define	cg_blksfree_swap(cgp, ns) \
    ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_freeoff, (ns))))
#define	cg_clustersfree_swap(cgp, ns) \
    ((u_int8_t *)((u_int8_t *)(cgp) + ufs_rw32((cgp)->cg_clusteroff, (ns))))
#define	cg_clustersum_swap(cgp, ns) \
    ((int32_t *)((uintptr_t)(cgp) + ufs_rw32((cgp)->cg_clustersumoff, ns)))

struct fs;
void	ffs_fragacct_swap(struct fs *, int, int32_t [], int, int);

fsinode *link_check(fsinode *);

#endif	/* _MAKEFS_H */
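[Editor's note: the cg_*_swap macros above defer to ufs_rw16/ufs_rw32, which
swap a field only when the image's byte order differs from the host's. Below
is a self-contained illustration of that idea; rw16/rw32 are stand-ins, not
the real UFS byte-swap helpers.]

#include <stdio.h>
#include <stdint.h>

static uint16_t
rw16(uint16_t v, int needswap)
{
	return needswap ? (uint16_t)((v << 8) | (v >> 8)) : v;
}

static uint32_t
rw32(uint32_t v, int needswap)
{
	return needswap ?
	    ((v >> 24) | ((v >> 8) & 0xff00) |
	    ((v << 8) & 0xff0000) | (v << 24)) : v;
}

int
main(void)
{
	uint32_t cg_magic = 0x090255;	/* CG_MAGIC */

	/* reading a field from a same-endian vs. an opposite-endian image */
	printf("native:  %#x\n", (unsigned)rw32(cg_magic, 0));
	printf("swapped: %#x\n", (unsigned)rw32(cg_magic, 1));
	printf("reclen:  %u\n", (unsigned)rw16(512, 1));  /* 512 -> 2 */
	return 0;
}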
Index: head/usr.sbin/quot/quot.c
===================================================================
--- head/usr.sbin/quot/quot.c	(revision 313779)
+++ head/usr.sbin/quot/quot.c	(revision 313780)
@@ -1,644 +1,644 @@
/*
 * Copyright (C) 1991, 1994 Wolfgang Solfrank.
 * Copyright (C) 1991, 1994 TooLs GmbH.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by TooLs GmbH.
 * 4. The name of TooLs GmbH may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* some flags of what to do: */
static char estimate;
static char count;
static char unused;
static void (*func)(int, struct fs *, char *);
static long blocksize;
static char *header;
static int headerlen;

static union dinode *get_inode(int, struct fs *, ino_t);
static int virtualblocks(struct fs *, union dinode *);
static int isfree(struct fs *, union dinode *);
static void inituser(void);
static void usrrehash(void);
static struct user *user(uid_t);
static int cmpusers(const void *, const void *);
static void uses(uid_t, daddr_t, time_t);
static void initfsizes(void);
static void dofsizes(int, struct fs *, char *);
static void douser(int, struct fs *, char *);
static void donames(int, struct fs *, char *);
static void usage(void);
static void quot(char *, char *);

/*
 * Original BSD quot doesn't round to number of frags/blocks,
 * doesn't account for indirection blocks and gets it totally
 * wrong if the size is a multiple of the blocksize.
 * The new code always counts the number of 512 byte blocks
 * instead of the number of kilobytes and converts them to
 * kByte when done (on request).
 *
 * Due to the size of modern disks, we must cast intermediate
 * values to 64 bits to prevent potential overflows.
 */
#ifdef	COMPAT
#define	SIZE(n)	(n)
#else
#define	SIZE(n)	((int)(((quad_t)(n) * 512 + blocksize - 1)/blocksize))
#endif

#define	INOCNT(fs)	((fs)->fs_ipg)
#define	INOSZ(fs) \
	(((fs)->fs_magic == FS_UFS1_MAGIC ? sizeof(struct ufs1_dinode) : \
	sizeof(struct ufs2_dinode)) * INOCNT(fs))

union dinode {
	struct ufs1_dinode dp1;
	struct ufs2_dinode dp2;
};
#define	DIP(fs, dp, field) \
	(((fs)->fs_magic == FS_UFS1_MAGIC) ? \
	(dp)->dp1.field : (dp)->dp2.field)

static union dinode *
get_inode(int fd, struct fs *super, ino_t ino)
{
	static caddr_t ipbuf;
	static struct cg *cgp;
	static ino_t last;
	static int cg;
	struct ufs2_dinode *di2;

	if (fd < 0) {		/* flush cache */
		if (ipbuf) {
			free(ipbuf);
			ipbuf = 0;
			if (super != NULL && super->fs_magic == FS_UFS2_MAGIC) {
				free(cgp);
				cgp = 0;
			}
		}
		return 0;
	}

	if (!ipbuf || ino < last || ino >= last + INOCNT(super)) {
		if (super->fs_magic == FS_UFS2_MAGIC &&
		    (!cgp || cg != ino_to_cg(super, ino))) {
			cg = ino_to_cg(super, ino);
			if (!cgp && !(cgp = malloc(super->fs_cgsize)))
				errx(1, "allocate cg");
			if (lseek(fd, (off_t)cgtod(super, cg) << super->fs_fshift, 0) < 0)
				err(1, "lseek cg");
			if (read(fd, cgp, super->fs_cgsize) != super->fs_cgsize)
				err(1, "read cg");
			if (!cg_chkmagic(cgp))
				errx(1, "cg has bad magic");
		}
		if (!ipbuf && !(ipbuf = malloc(INOSZ(super))))
			errx(1, "allocate inodes");
		last = rounddown(ino, INOCNT(super));
		if (lseek(fd, (off_t)ino_to_fsba(super, last) << super->fs_fshift, 0) < (off_t)0 ||
		    read(fd, ipbuf, INOSZ(super)) != (ssize_t)INOSZ(super))
			err(1, "read inodes");
	}

	if (super->fs_magic == FS_UFS1_MAGIC)
		return ((union dinode *)
		    &((struct ufs1_dinode *)ipbuf)[ino % INOCNT(super)]);
	di2 = &((struct ufs2_dinode *)ipbuf)[ino % INOCNT(super)];
	/* If the inode is unused, it might be unallocated too, so zero it. */
	if (isclr(cg_inosused(cgp), ino % super->fs_ipg))
		bzero(di2, sizeof (*di2));
	return ((union dinode *)di2);
}
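[Editor's note: a worked example of the SIZE() conversion described in the
comment above: di_blocks counts 512-byte sectors, and SIZE() rounds them up
into getbsize(3)-sized units. The 1024-byte blocksize assumed here matches
what -k forces; the macro is copied from the file above.]

#include <sys/types.h>
#include <stdio.h>

static long blocksize = 1024;

#define SIZE(n)	((int)(((quad_t)(n) * 512 + blocksize - 1)/blocksize))

int
main(void)
{
	/* 3 sectors = 1536 bytes -> rounds up to 2 KiB-blocks */
	printf("%d\n", SIZE(3));	/* prints 2 */
	/* 4 sectors = 2048 bytes -> exactly 2, no rounding needed */
	printf("%d\n", SIZE(4));	/* prints 2 */
	return 0;
}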
#ifdef	COMPAT
#define	actualblocks(fs, dp)	(DIP(fs, dp, di_blocks) / 2)
#else
#define	actualblocks(fs, dp)	DIP(fs, dp, di_blocks)
#endif

static int
virtualblocks(struct fs *super, union dinode *dp)
{
	off_t nblk, sz;

	sz = DIP(super, dp, di_size);
#ifdef	COMPAT
-	if (lblkno(super,sz) >= NDADDR) {
+	if (lblkno(super,sz) >= UFS_NDADDR) {
		nblk = blkroundup(super,sz);
		if (sz == nblk)
			nblk += super->fs_bsize;
	}

	return sz / 1024;
#else	/* COMPAT */
-	if (lblkno(super,sz) >= NDADDR) {
+	if (lblkno(super,sz) >= UFS_NDADDR) {
		nblk = blkroundup(super,sz);
		sz = lblkno(super,nblk);
-		sz = (sz - NDADDR + NINDIR(super) - 1) / NINDIR(super);
+		sz = (sz - UFS_NDADDR + NINDIR(super) - 1) / NINDIR(super);
		while (sz > 0) {
			nblk += sz * super->fs_bsize;
			/* sz - 1 rounded up */
			sz = (sz - 1 + NINDIR(super) - 1) / NINDIR(super);
		}
	} else
		nblk = fragroundup(super,sz);

	return nblk / 512;
#endif	/* COMPAT */
}

static int
isfree(struct fs *super, union dinode *dp)
{
#ifdef	COMPAT
	return (DIP(super, dp, di_mode) & IFMT) == 0;
#else	/* COMPAT */
	switch (DIP(super, dp, di_mode) & IFMT) {
	case IFIFO:
	case IFLNK:		/* should check FASTSYMLINK? */
	case IFDIR:
	case IFREG:
		return 0;
	case IFCHR:
	case IFBLK:
	case IFSOCK:
	case IFWHT:
	case 0:
		return 1;
	default:
		errx(1, "unknown IFMT 0%o", DIP(super, dp, di_mode) & IFMT);
	}
#endif
}

static struct user {
	uid_t uid;
	char *name;
	daddr_t space;
	long count;
	daddr_t spc30;
	daddr_t spc60;
	daddr_t spc90;
} *users;
static int nusers;

static void
inituser(void)
{
	int i;
	struct user *usr;

	if (!nusers) {
		nusers = 8;
		if (!(users = (struct user *)calloc(nusers,sizeof(struct user))))
			errx(1, "allocate users");
	} else {
		for (usr = users, i = nusers; --i >= 0; usr++) {
			usr->space = usr->spc30 = usr->spc60 = usr->spc90 = 0;
			usr->count = 0;
		}
	}
}

static void
usrrehash(void)
{
	int i;
	struct user *usr, *usrn;
	struct user *svusr;

	svusr = users;
	nusers <<= 1;
	if (!(users = (struct user *)calloc(nusers,sizeof(struct user))))
		errx(1, "allocate users");
	for (usr = svusr, i = nusers >> 1; --i >= 0; usr++) {
		for (usrn = users + (usr->uid&(nusers - 1)); usrn->name;
		    usrn--) {
			if (usrn <= users)
				usrn = users + nusers;
		}
		*usrn = *usr;
	}
}

static struct user *
user(uid_t uid)
{
	struct user *usr;
	int i;
	struct passwd *pwd;

	while (1) {
		for (usr = users + (uid&(nusers - 1)), i = nusers; --i >= 0;
		    usr--) {
			if (!usr->name) {
				usr->uid = uid;

				if (!(pwd = getpwuid(uid))) {
					if ((usr->name = (char *)malloc(7)))
						sprintf(usr->name,"#%d",uid);
				} else {
					if ((usr->name = (char *)
					    malloc(strlen(pwd->pw_name) + 1)))
						strcpy(usr->name,pwd->pw_name);
				}
				if (!usr->name)
					errx(1, "allocate users");

				return usr;

			} else if (usr->uid == uid)
				return usr;

			if (usr <= users)
				usr = users + nusers;
		}
		usrrehash();
	}
}

static int
cmpusers(const void *v1, const void *v2)
{
	const struct user *u1, *u2;
	u1 = (const struct user *)v1;
	u2 = (const struct user *)v2;

	return u2->space - u1->space;
}

#define	sortusers(users)	(qsort((users),nusers,sizeof(struct user), \
				    cmpusers))

static void
uses(uid_t uid, daddr_t blks, time_t act)
{
	static time_t today;
	struct user *usr;

	if (!today)
		time(&today);

	usr = user(uid);
	usr->count++;
	usr->space += blks;

	if (today - act > 90L * 24L * 60L * 60L)
		usr->spc90 += blks;
	if (today - act > 60L * 24L * 60L * 60L)
		usr->spc60 += blks;
	if (today - act > 30L * 24L * 60L * 60L)
		usr->spc30 += blks;
}

#ifdef	COMPAT
#define	FSZCNT	500
#else
#define	FSZCNT	512
#endif
struct fsizes {
	struct fsizes *fsz_next;
	daddr_t fsz_first, fsz_last;
	ino_t fsz_count[FSZCNT];
	daddr_t fsz_sz[FSZCNT];
} *fsizes;
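[Editor's note: the indirect-block estimate in virtualblocks() charges one map
block per NINDIR(super) pointers at each level of indirection. Below is the
same recurrence extracted into a stand-alone sketch; the constants (16 KB
blocks, 4096 pointers per map block, a ~1.6 GB file) are illustrative, and
this is not quot's code.]

#include <stdio.h>

int
main(void)
{
	long ndaddr = 12;		/* UFS_NDADDR: direct blocks */
	long nindir = 4096;		/* NINDIR(fs): pointers per map block */
	long bsize = 16384;
	long datablocks = 100000;	/* data blocks in the file */
	long maps = 0, sz;

	/* data blocks that must be reached through indirection */
	sz = (datablocks - ndaddr + nindir - 1) / nindir;
	while (sz > 0) {
		maps += sz;
		/* sz - 1 rounded up, as in virtualblocks() */
		sz = (sz - 1 + nindir - 1) / nindir;
	}
	printf("%ld map blocks (%ld bytes of overhead)\n",
	    maps, maps * bsize);
	return 0;
}

[For these numbers the loop converges in two iterations: 25 first-level maps
plus one second-level map, the overhead the -h estimate adds on top of the
rounded-up data size.]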
static void
initfsizes(void)
{
	struct fsizes *fp;
	int i;

	for (fp = fsizes; fp; fp = fp->fsz_next) {
		for (i = FSZCNT; --i >= 0;) {
			fp->fsz_count[i] = 0;
			fp->fsz_sz[i] = 0;
		}
	}
}

static void
dofsizes(int fd, struct fs *super, char *name)
{
	ino_t inode, maxino;
	union dinode *dp;
	daddr_t sz, ksz;
	struct fsizes *fp, **fsp;
	int i;

	maxino = super->fs_ncg * super->fs_ipg - 1;
#ifdef	COMPAT
	if (!(fsizes = (struct fsizes *)malloc(sizeof(struct fsizes))))
		errx(1, "allocate fsize structure");
#endif	/* COMPAT */
	for (inode = 0; inode < maxino; inode++) {
		errno = 0;
		if ((dp = get_inode(fd,super,inode))
#ifdef	COMPAT
		    && ((DIP(super, dp, di_mode) & IFMT) == IFREG
			|| (DIP(super, dp, di_mode) & IFMT) == IFDIR)
#else	/* COMPAT */
		    && !isfree(super, dp)
#endif	/* COMPAT */
		    ) {
			sz = estimate ? virtualblocks(super, dp) :
			    actualblocks(super, dp);
#ifdef	COMPAT
			if (sz >= FSZCNT) {
				fsizes->fsz_count[FSZCNT-1]++;
				fsizes->fsz_sz[FSZCNT-1] += sz;
			} else {
				fsizes->fsz_count[sz]++;
				fsizes->fsz_sz[sz] += sz;
			}
#else	/* COMPAT */
			ksz = SIZE(sz);
			for (fsp = &fsizes; (fp = *fsp); fsp = &fp->fsz_next) {
				if (ksz < fp->fsz_last)
					break;
			}
			if (!fp || ksz < fp->fsz_first) {
				if (!(fp = (struct fsizes *)
				    malloc(sizeof(struct fsizes))))
					errx(1, "allocate fsize structure");
				fp->fsz_next = *fsp;
				*fsp = fp;
				fp->fsz_first = rounddown(ksz, FSZCNT);
				fp->fsz_last = fp->fsz_first + FSZCNT;
				for (i = FSZCNT; --i >= 0;) {
					fp->fsz_count[i] = 0;
					fp->fsz_sz[i] = 0;
				}
			}
			fp->fsz_count[ksz % FSZCNT]++;
			fp->fsz_sz[ksz % FSZCNT] += sz;
#endif	/* COMPAT */
		} else if (errno) {
			err(1, "%s", name);
		}
	}
	sz = 0;
	for (fp = fsizes; fp; fp = fp->fsz_next) {
		for (i = 0; i < FSZCNT; i++) {
			if (fp->fsz_count[i])
				printf("%jd\t%jd\t%d\n",
				    (intmax_t)(fp->fsz_first + i),
				    (intmax_t)fp->fsz_count[i],
				    SIZE(sz += fp->fsz_sz[i]));
		}
	}
}
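[Editor's note: dofsizes() keeps a sorted linked list of FSZCNT-wide buckets
and, within each bucket, indexes fsz_count[]/fsz_sz[] by ksz % FSZCNT. The
bucket arithmetic in isolation, with made-up sizes; this mirrors the
rounddown()/modulus pair above rather than reusing quot's structures:]

#include <stdio.h>

#define FSZCNT 512

int
main(void)
{
	long ksz[] = { 3, 700, 3, 514, 1030 };	/* sizes in SIZE() units */
	int i;

	for (i = 0; i < 5; i++) {
		/* rounddown(ksz, FSZCNT) picks the bucket's first size */
		long first = (ksz[i] / FSZCNT) * FSZCNT;

		printf("size %4ld -> bucket [%4ld,%4ld), slot %ld\n",
		    ksz[i], first, first + FSZCNT, ksz[i] % FSZCNT);
	}
	return 0;
}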
static void
douser(int fd, struct fs *super, char *name)
{
	ino_t inode, maxino;
	struct user *usr, *usrs;
	union dinode *dp;
	int n;

	maxino = super->fs_ncg * super->fs_ipg - 1;
	for (inode = 0; inode < maxino; inode++) {
		errno = 0;
		if ((dp = get_inode(fd,super,inode))
		    && !isfree(super, dp))
			uses(DIP(super, dp, di_uid),
			    estimate ? virtualblocks(super, dp) :
			    actualblocks(super, dp),
			    DIP(super, dp, di_atime));
		else if (errno) {
			err(1, "%s", name);
		}
	}
	if (!(usrs = (struct user *)malloc(nusers * sizeof(struct user))))
		errx(1, "allocate users");
	bcopy(users,usrs,nusers * sizeof(struct user));
	sortusers(usrs);
	for (usr = usrs, n = nusers; --n >= 0 && usr->count; usr++) {
		printf("%5d",SIZE(usr->space));
		if (count)
			printf("\t%5ld",usr->count);
		printf("\t%-8s",usr->name);
		if (unused)
			printf("\t%5d\t%5d\t%5d",
			    SIZE(usr->spc30),
			    SIZE(usr->spc60),
			    SIZE(usr->spc90));
		printf("\n");
	}
	free(usrs);
}

static void
donames(int fd, struct fs *super, char *name)
{
	int c;
	ino_t maxino;
	uintmax_t inode;
	union dinode *dp;

	maxino = super->fs_ncg * super->fs_ipg - 1;
	/* first skip the name of the filesystem */
	while ((c = getchar()) != EOF && (c < '0' || c > '9'))
		while ((c = getchar()) != EOF && c != '\n');
	ungetc(c,stdin);
	while (scanf("%ju", &inode) == 1) {
		if (inode > maxino) {
			warnx("illegal inode %ju", inode);
			return;
		}
		errno = 0;
		if ((dp = get_inode(fd,super,inode))
		    && !isfree(super, dp)) {
			printf("%s\t",user(DIP(super, dp, di_uid))->name);
			/* now skip whitespace */
			while ((c = getchar()) == ' ' || c == '\t');
			/* and print out the remainder of the input line */
			while (c != EOF && c != '\n') {
				putchar(c);
				c = getchar();
			}
			putchar('\n');
		} else {
			if (errno) {
				err(1, "%s", name);
			}
			/* skip this line */
			while ((c = getchar()) != EOF && c != '\n');
		}
		if (c == EOF)
			break;
	}
}

static void
usage(void)
{
#ifdef	COMPAT
	fprintf(stderr,"usage: quot [-nfcvha] [filesystem ...]\n");
#else	/* COMPAT */
	fprintf(stderr,"usage: quot [-acfhknv] [filesystem ...]\n");
#endif	/* COMPAT */
	exit(1);
}

/*
 * Possible superblock locations ordered from most to least likely.
 */
static int sblock_try[] = SBLOCKSEARCH;
static char superblock[SBLOCKSIZE];

void
quot(char *name, char *mp)
{
	int i, fd;
	struct fs *fs;

	get_inode(-1, NULL, 0);		/* flush cache */
	inituser();
	initfsizes();
	if ((fd = open(name,0)) < 0) {
		warn("%s", name);
		close(fd);
		return;
	}
	for (i = 0; sblock_try[i] != -1; i++) {
		if (lseek(fd, sblock_try[i], 0) != sblock_try[i]) {
			close(fd);
			return;
		}
		if (read(fd, superblock, SBLOCKSIZE) != SBLOCKSIZE) {
			close(fd);
			return;
		}
		fs = (struct fs *)superblock;
		if ((fs->fs_magic == FS_UFS1_MAGIC ||
		    (fs->fs_magic == FS_UFS2_MAGIC &&
		    fs->fs_sblockloc == sblock_try[i])) &&
		    fs->fs_bsize <= MAXBSIZE &&
		    fs->fs_bsize >= sizeof(struct fs))
			break;
	}
	if (sblock_try[i] == -1) {
		warnx("%s: not a BSD filesystem",name);
		close(fd);
		return;
	}
	printf("%s:",name);
	if (mp)
		printf(" (%s)",mp);
	putchar('\n');
	(*func)(fd, fs, name);
	close(fd);
}

int
main(int argc, char *argv[])
{
	char all = 0;
	struct statfs *mp;
	struct fstab *fs;
	int cnt;

	func = douser;
#ifndef	COMPAT
	header = getbsize(&headerlen,&blocksize);
#endif
	while (--argc > 0 && **++argv == '-') {
		while (*++*argv) {
			switch (**argv) {
			case 'n':
				func = donames;
				break;
			case 'c':
				func = dofsizes;
				break;
			case 'a':
				all = 1;
				break;
			case 'f':
				count = 1;
				break;
			case 'h':
				estimate = 1;
				break;
#ifndef	COMPAT
			case 'k':
				blocksize = 1024;
				break;
#endif	/* COMPAT */
			case 'v':
				unused = 1;
				break;
			default:
				usage();
			}
		}
	}
	if (all) {
		cnt = getmntinfo(&mp,MNT_NOWAIT);
		for (; --cnt >= 0; mp++) {
			if (!strncmp(mp->f_fstypename, "ufs", MFSNAMELEN))
				quot(mp->f_mntfromname, mp->f_mntonname);
		}
	}
	while (--argc >= 0) {
		if ((fs = getfsfile(*argv)) != NULL)
			quot(fs->fs_spec, 0);
		else
			quot(*argv,0);
		argv++;
	}
	return 0;
}
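[Editor's note: quot()'s probe loop above simply tries each standard
superblock offset until one holds a plausible superblock. A sketch of that
loop's shape, using the usual UFS2/UFS1/floppy/piggyback search offsets;
magic_at() is a hypothetical stand-in that simulates the read.]

#include <stdio.h>
#include <stdint.h>

#define FS_UFS2_MAGIC	0x19540119

/* the usual SBLOCKSEARCH order: UFS2, UFS1, floppy, piggyback */
static int sblock_try[] = { 65536, 8192, 0, 262144, -1 };

/* pretend only the UFS2 location at 65536 holds a superblock */
static uint32_t
magic_at(int offset)
{
	return offset == 65536 ? FS_UFS2_MAGIC : 0;
}

int
main(void)
{
	int i;

	for (i = 0; sblock_try[i] != -1; i++)
		if (magic_at(sblock_try[i]) == FS_UFS2_MAGIC) {
			printf("superblock at offset %d\n", sblock_try[i]);
			return 0;
		}
	printf("not a BSD filesystem\n");
	return 1;
}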