Index: head/sys/fs/msdosfs/fat.h =================================================================== --- head/sys/fs/msdosfs/fat.h (revision 352232) +++ head/sys/fs/msdosfs/fat.h (revision 352233) @@ -1,109 +1,115 @@ /* $FreeBSD$ */ /* $NetBSD: fat.h,v 1.12 1997/11/17 15:36:36 ws Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #ifndef _FS_MSDOSFS_FAT_H_ #define _FS_MSDOSFS_FAT_H_ /* * Some useful cluster numbers. */ #define MSDOSFSROOT 0 /* cluster 0 means the root dir */ #define CLUST_FREE 0 /* cluster 0 also means a free cluster */ #define MSDOSFSFREE CLUST_FREE #define CLUST_FIRST 2 /* first legal cluster number */ #define CLUST_RSRVD 0xfffffff6 /* reserved cluster range */ #define CLUST_BAD 0xfffffff7 /* a cluster with a defect */ #define CLUST_EOFS 0xfffffff8 /* start of eof cluster range */ #define CLUST_EOFE 0xffffffff /* end of eof cluster range */ #define FAT12_MASK 0x00000fff /* mask for 12 bit cluster numbers */ #define FAT16_MASK 0x0000ffff /* mask for 16 bit cluster numbers */ #define FAT32_MASK 0x0fffffff /* mask for FAT32 cluster numbers */ /* * MSDOSFS: * Return true if filesystem uses 12 bit FATs. Microsoft Programmer's * Reference says if the maximum cluster number in a filesystem is greater * than 4084 ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK) then we've got a * 16 bit FAT filesystem. 
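As a rough illustration of the threshold test described in the comment above, the sketch below (helper name hypothetical) applies the same constants, derived from (CLUST_RSRVD - CLUST_FIRST) masked with FAT12_MASK and FAT16_MASK. In practice FAT32 is recognized earlier, from a BPB with bpbRootDirEnts == 0, as mountmsdosfs() does later in this change.

	static int
	fat_bits_for_maxcluster(u_long maxcluster)
	{

		if (maxcluster <= 4084)		/* (CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK */
			return (12);
		if (maxcluster <= 65524)	/* (CLUST_RSRVD - CLUST_FIRST) & FAT16_MASK */
			return (16);
		return (32);			/* FAT32 is normally detected from the BPB instead */
	}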
While mounting, the result of this test is stored * in pm_fatentrysize. */ #define FAT12(pmp) (pmp->pm_fatmask == FAT12_MASK) #define FAT16(pmp) (pmp->pm_fatmask == FAT16_MASK) #define FAT32(pmp) (pmp->pm_fatmask == FAT32_MASK) #define MSDOSFSEOF(pmp, cn) ((((cn) | ~(pmp)->pm_fatmask) & CLUST_EOFS) == CLUST_EOFS) #if defined (_KERNEL) || defined(MAKEFS) /* * These are the values for the function argument to the function * fatentry(). */ #define FAT_GET 0x0001 /* get a FAT entry */ #define FAT_SET 0x0002 /* set a FAT entry */ #define FAT_GET_AND_SET (FAT_GET | FAT_SET) /* * Flags to extendfile: */ #define DE_CLEAR 1 /* Zero out the blocks allocated */ int pcbmap(struct denode *dep, u_long findcn, daddr_t *bnp, u_long *cnp, int* sp); int clusterfree(struct msdosfsmount *pmp, u_long cn, u_long *oldcnp); int clusteralloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got); int fatentry(int function, struct msdosfsmount *pmp, u_long cluster, u_long *oldcontents, u_long newcontents); int freeclusterchain(struct msdosfsmount *pmp, u_long startchain); int extendfile(struct denode *dep, u_long count, struct buf **bpp, u_long *ncp, int flags); void fc_purge(struct denode *dep, u_int frcn); -int markvoldirty(struct msdosfsmount *pmp, int dirty); +int markvoldirty_upgrade(struct msdosfsmount *pmp, bool dirty, bool rw_upgrade); + +static inline int +markvoldirty(struct msdosfsmount *pmp, bool dirty) +{ + return (markvoldirty_upgrade(pmp, dirty, false)); +} #endif /* _KERNEL || MAKEFS */ #endif /* !_FS_MSDOSFS_FAT_H_ */ Index: head/sys/fs/msdosfs/msdosfs_fat.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_fat.c (revision 352232) +++ head/sys/fs/msdosfs/msdosfs_fat.c (revision 352233) @@ -1,1172 +1,1198 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_fat.c,v 1.28 1997/11/17 15:36:49 ws Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
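A usage note on the interface change above, matching the call sites later in this diff: existing callers keep the two-argument markvoldirty() and implicitly pass rw_upgrade = false; only the ro-to-rw upgrade path in msdosfs_mount() calls markvoldirty_upgrade() directly.

	error = markvoldirty(pmp, 1);			/* ordinary mount/unmount paths */
	error = markvoldirty_upgrade(pmp, true, true);	/* ro->rw upgrade only */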
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #define FULL_RUN ((u_int)0xffffffff) static int chainalloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got); static int chainlength(struct msdosfsmount *pmp, u_long start, u_long count); static void fatblock(struct msdosfsmount *pmp, u_long ofs, u_long *bnp, u_long *sizep, u_long *bop); static int fatchain(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith); static void fc_lookup(struct denode *dep, u_long findcn, u_long *frcnp, u_long *fsrcnp); static void updatefats(struct msdosfsmount *pmp, struct buf *bp, u_long fatbn); static __inline void usemap_alloc(struct msdosfsmount *pmp, u_long cn); static __inline void usemap_free(struct msdosfsmount *pmp, u_long cn); static int clusteralloc1(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got); static void fatblock(struct msdosfsmount *pmp, u_long ofs, u_long *bnp, u_long *sizep, u_long *bop) { u_long bn, size; bn = ofs / pmp->pm_fatblocksize * pmp->pm_fatblocksec; size = min(pmp->pm_fatblocksec, pmp->pm_FATsecs - bn) * DEV_BSIZE; bn += pmp->pm_fatblk + pmp->pm_curfat * pmp->pm_FATsecs; if (bnp) *bnp = bn; if (sizep) *sizep = size; if (bop) *bop = ofs % pmp->pm_fatblocksize; } /* * Map the logical cluster number of a file into a physical disk sector * that is filesystem relative. * * dep - address of denode representing the file of interest * findcn - file relative cluster whose filesystem relative cluster number * and/or block number are/is to be found * bnp - address of where to place the filesystem relative block number. * If this pointer is null then don't return this quantity. * cnp - address of where to place the filesystem relative cluster number. * If this pointer is null then don't return this quantity. * sp - pointer to returned block size * * NOTE: Either bnp or cnp must be non-null. * This function has one side effect. If the requested file relative cluster * is beyond the end of file, then the actual number of clusters in the file * is returned in *cnp. This is useful for determining how long a directory is. * If cnp is null, nothing is returned. 
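A hedged caller-side sketch of the E2BIG convention described above, mirroring how extendfile() later in this file primes its FC_LASTFC cache entry; dep is assumed to be a locked denode.

	u_long nclust;
	int error;

	/* Ask for a cluster far past EOF; pcbmap() then reports the file's
	 * actual cluster count through *cnp and returns E2BIG. */
	error = pcbmap(dep, 0xffff, NULL, &nclust, NULL);
	if (error == E2BIG)
		error = 0;	/* nclust now holds the file's cluster count */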
*/ int pcbmap(struct denode *dep, u_long findcn, daddr_t *bnp, u_long *cnp, int *sp) { int error; u_long i; u_long cn; u_long prevcn = 0; /* XXX: prevcn could be used unititialized */ u_long byteoffset; u_long bn; u_long bo; struct buf *bp = NULL; u_long bp_bn = -1; struct msdosfsmount *pmp = dep->de_pmp; u_long bsize; KASSERT(bnp != NULL || cnp != NULL || sp != NULL, ("pcbmap: extra call")); ASSERT_VOP_ELOCKED(DETOV(dep), "pcbmap"); cn = dep->de_StartCluster; /* * The "file" that makes up the root directory is contiguous, * permanently allocated, of fixed size, and is not made up of * clusters. If the cluster number is beyond the end of the root * directory, then return the number of clusters in the file. */ if (cn == MSDOSFSROOT) { if (dep->de_Attributes & ATTR_DIRECTORY) { if (de_cn2off(pmp, findcn) >= dep->de_FileSize) { if (cnp) *cnp = de_bn2cn(pmp, pmp->pm_rootdirsize); return (E2BIG); } if (bnp) *bnp = pmp->pm_rootdirblk + de_cn2bn(pmp, findcn); if (cnp) *cnp = MSDOSFSROOT; if (sp) *sp = min(pmp->pm_bpcluster, dep->de_FileSize - de_cn2off(pmp, findcn)); return (0); } else { /* just an empty file */ if (cnp) *cnp = 0; return (E2BIG); } } /* * All other files do I/O in cluster sized blocks */ if (sp) *sp = pmp->pm_bpcluster; /* * Rummage around in the FAT cache, maybe we can avoid tromping * through every FAT entry for the file. And, keep track of how far * off the cache was from where we wanted to be. */ i = 0; fc_lookup(dep, findcn, &i, &cn); /* * Handle all other files or directories the normal way. */ for (; i < findcn; i++) { /* * Stop with all reserved clusters, not just with EOF. */ if ((cn | ~pmp->pm_fatmask) >= CLUST_RSRVD) goto hiteof; byteoffset = FATOFS(pmp, cn); fatblock(pmp, byteoffset, &bn, &bsize, &bo); if (bn != bp_bn) { if (bp) brelse(bp); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { return (error); } bp_bn = bn; } prevcn = cn; if (bo >= bsize) { if (bp) brelse(bp); return (EIO); } if (FAT32(pmp)) cn = getulong(bp->b_data + bo); else cn = getushort(bp->b_data + bo); if (FAT12(pmp) && (prevcn & 1)) cn >>= 4; cn &= pmp->pm_fatmask; /* * Force the special cluster numbers * to be the same for all cluster sizes * to let the rest of msdosfs handle * all cases the same. */ if ((cn | ~pmp->pm_fatmask) >= CLUST_RSRVD) cn |= ~pmp->pm_fatmask; } if (!MSDOSFSEOF(pmp, cn)) { if (bp) brelse(bp); if (bnp) *bnp = cntobn(pmp, cn); if (cnp) *cnp = cn; fc_setcache(dep, FC_LASTMAP, i, cn); return (0); } hiteof:; if (cnp) *cnp = i; if (bp) brelse(bp); /* update last file cluster entry in the FAT cache */ fc_setcache(dep, FC_LASTFC, i - 1, prevcn); return (E2BIG); } /* * Find the closest entry in the FAT cache to the cluster we are looking * for. */ static void fc_lookup(struct denode *dep, u_long findcn, u_long *frcnp, u_long *fsrcnp) { int i; u_long cn; struct fatcache *closest = NULL; ASSERT_VOP_LOCKED(DETOV(dep), "fc_lookup"); for (i = 0; i < FC_SIZE; i++) { cn = dep->de_fc[i].fc_frcn; if (cn != FCE_EMPTY && cn <= findcn) { if (closest == NULL || cn > closest->fc_frcn) closest = &dep->de_fc[i]; } } if (closest) { *frcnp = closest->fc_frcn; *fsrcnp = closest->fc_fsrcn; } } /* * Purge the FAT cache in denode dep of all entries relating to file * relative cluster frcn and beyond. */ void fc_purge(struct denode *dep, u_int frcn) { int i; struct fatcache *fcp; ASSERT_VOP_ELOCKED(DETOV(dep), "fc_purge"); fcp = dep->de_fc; for (i = 0; i < FC_SIZE; i++, fcp++) { if (fcp->fc_frcn >= frcn) fcp->fc_frcn = FCE_EMPTY; } } /* * Update the FAT. 
* If mirroring the FAT, update all copies, with the first copy as last. * Else update only the current FAT (ignoring the others). * * pmp - msdosfsmount structure for filesystem to update * bp - addr of modified FAT block * fatbn - block number relative to begin of filesystem of the modified FAT block. */ static void updatefats(struct msdosfsmount *pmp, struct buf *bp, u_long fatbn) { struct buf *bpn; int cleanfat, i; #ifdef MSDOSFS_DEBUG printf("updatefats(pmp %p, bp %p, fatbn %lu)\n", pmp, bp, fatbn); #endif if (pmp->pm_flags & MSDOSFS_FATMIRROR) { /* * Now copy the block(s) of the modified FAT to the other copies of * the FAT and write them out. This is faster than reading in the * other FATs and then writing them back out. This could tie up * the FAT for quite a while. Preventing others from accessing it. * To prevent us from going after the FAT quite so much we use * delayed writes, unless they specified "synchronous" when the * filesystem was mounted. If synch is asked for then use * bwrite()'s and really slow things down. */ if (fatbn != pmp->pm_fatblk || FAT12(pmp)) cleanfat = 0; else if (FAT16(pmp)) cleanfat = 16; else cleanfat = 32; for (i = 1; i < pmp->pm_FATs; i++) { fatbn += pmp->pm_FATsecs; /* getblk() never fails */ bpn = getblk(pmp->pm_devvp, fatbn, bp->b_bcount, 0, 0, 0); memcpy(bpn->b_data, bp->b_data, bp->b_bcount); /* Force the clean bit on in the other copies. */ if (cleanfat == 16) ((uint8_t *)bpn->b_data)[3] |= 0x80; else if (cleanfat == 32) ((uint8_t *)bpn->b_data)[7] |= 0x08; if (pmp->pm_mountp->mnt_flag & MNT_SYNCHRONOUS) bwrite(bpn); else bdwrite(bpn); } } /* * Write out the first (or current) FAT last. */ if (pmp->pm_mountp->mnt_flag & MNT_SYNCHRONOUS) bwrite(bp); else bdwrite(bp); } /* * Updating entries in 12 bit FATs is a pain in the butt. * * The following picture shows where nibbles go when moving from a 12 bit * cluster number into the appropriate bytes in the FAT. * * byte m byte m+1 byte m+2 * +----+----+ +----+----+ +----+----+ * | 0 1 | | 2 3 | | 4 5 | FAT bytes * +----+----+ +----+----+ +----+----+ * * +----+----+----+ +----+----+----+ * | 3 0 1 | | 4 5 2 | * +----+----+----+ +----+----+----+ * cluster n cluster n+1 * * Where n is even. 
m = n + (n >> 2) * */ static __inline void usemap_alloc(struct msdosfsmount *pmp, u_long cn) { MSDOSFS_ASSERT_MP_LOCKED(pmp); KASSERT(cn <= pmp->pm_maxcluster, ("cn too large %lu %lu", cn, pmp->pm_maxcluster)); KASSERT((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0, ("usemap_alloc on ro msdosfs mount")); KASSERT((pmp->pm_inusemap[cn / N_INUSEBITS] & (1 << (cn % N_INUSEBITS))) == 0, ("Allocating used sector %ld %ld %x", cn, cn % N_INUSEBITS, (unsigned)pmp->pm_inusemap[cn / N_INUSEBITS])); pmp->pm_inusemap[cn / N_INUSEBITS] |= 1U << (cn % N_INUSEBITS); KASSERT(pmp->pm_freeclustercount > 0, ("usemap_alloc: too little")); pmp->pm_freeclustercount--; pmp->pm_flags |= MSDOSFS_FSIMOD; } static __inline void usemap_free(struct msdosfsmount *pmp, u_long cn) { MSDOSFS_ASSERT_MP_LOCKED(pmp); KASSERT(cn <= pmp->pm_maxcluster, ("cn too large %lu %lu", cn, pmp->pm_maxcluster)); KASSERT((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0, ("usemap_free on ro msdosfs mount")); pmp->pm_freeclustercount++; pmp->pm_flags |= MSDOSFS_FSIMOD; KASSERT((pmp->pm_inusemap[cn / N_INUSEBITS] & (1 << (cn % N_INUSEBITS))) != 0, ("Freeing unused sector %ld %ld %x", cn, cn % N_INUSEBITS, (unsigned)pmp->pm_inusemap[cn / N_INUSEBITS])); pmp->pm_inusemap[cn / N_INUSEBITS] &= ~(1U << (cn % N_INUSEBITS)); } int clusterfree(struct msdosfsmount *pmp, u_long cluster, u_long *oldcnp) { int error; u_long oldcn; error = fatentry(FAT_GET_AND_SET, pmp, cluster, &oldcn, MSDOSFSFREE); if (error) return (error); /* * If the cluster was successfully marked free, then update * the count of free clusters, and turn off the "allocated" * bit in the "in use" cluster bit map. */ MSDOSFS_LOCK_MP(pmp); usemap_free(pmp, cluster); MSDOSFS_UNLOCK_MP(pmp); if (oldcnp) *oldcnp = oldcn; return (0); } /* * Get or Set or 'Get and Set' the cluster'th entry in the FAT. * * function - whether to get or set a FAT entry * pmp - address of the msdosfsmount structure for the filesystem * whose FAT is to be manipulated. * cn - which cluster is of interest * oldcontents - address of a word that is to receive the contents of the * cluster'th entry if this is a get function * newcontents - the new value to be written into the cluster'th element of * the FAT if this is a set function. * * This function can also be used to free a cluster by setting the FAT entry * for a cluster to 0. * * All copies of the FAT are updated if this is a set function. NOTE: If * fatentry() marks a cluster as free it does not update the inusemap in * the msdosfsmount structure. This is left to the caller. */ int fatentry(int function, struct msdosfsmount *pmp, u_long cn, u_long *oldcontents, u_long newcontents) { int error; u_long readcn; u_long bn, bo, bsize, byteoffset; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("fatentry(func %d, pmp %p, clust %lu, oldcon %p, newcon %lx)\n", function, pmp, cn, oldcontents, newcontents); #endif #ifdef DIAGNOSTIC /* * Be sure they asked us to do something. */ if ((function & (FAT_SET | FAT_GET)) == 0) { #ifdef MSDOSFS_DEBUG printf("fatentry(): function code doesn't specify get or set\n"); #endif return (EINVAL); } /* * If they asked us to return a cluster number but didn't tell us * where to put it, give them an error. */ if ((function & FAT_GET) && oldcontents == NULL) { #ifdef MSDOSFS_DEBUG printf("fatentry(): get function with no place to put result\n"); #endif return (EINVAL); } #endif /* * Be sure the requested cluster is in the filesystem. 
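For reference, the index/mask arithmetic used by usemap_alloc() and usemap_free() above, written out as a hypothetical read-only helper (N_INUSEBITS is the number of bits per pm_inusemap word, from the msdosfs headers):

	static __inline int
	usemap_isset(struct msdosfsmount *pmp, u_long cn)
	{

		return ((pmp->pm_inusemap[cn / N_INUSEBITS] &
		    (1U << (cn % N_INUSEBITS))) != 0);
	}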
*/ if (cn < CLUST_FIRST || cn > pmp->pm_maxcluster) return (EINVAL); byteoffset = FATOFS(pmp, cn); fatblock(pmp, byteoffset, &bn, &bsize, &bo); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { return (error); } if (function & FAT_GET) { if (FAT32(pmp)) readcn = getulong(bp->b_data + bo); else readcn = getushort(bp->b_data + bo); if (FAT12(pmp) & (cn & 1)) readcn >>= 4; readcn &= pmp->pm_fatmask; /* map reserved FAT entries to same values for all FATs */ if ((readcn | ~pmp->pm_fatmask) >= CLUST_RSRVD) readcn |= ~pmp->pm_fatmask; *oldcontents = readcn; } if (function & FAT_SET) { switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(bp->b_data + bo); if (cn & 1) { readcn &= 0x000f; readcn |= newcontents << 4; } else { readcn &= 0xf000; readcn |= newcontents & 0xfff; } putushort(bp->b_data + bo, readcn); break; case FAT16_MASK: putushort(bp->b_data + bo, newcontents); break; case FAT32_MASK: /* * According to spec we have to retain the * high order bits of the FAT entry. */ readcn = getulong(bp->b_data + bo); readcn &= ~FAT32_MASK; readcn |= newcontents & FAT32_MASK; putulong(bp->b_data + bo, readcn); break; } updatefats(pmp, bp, bn); bp = NULL; pmp->pm_fmod = 1; } if (bp) brelse(bp); return (0); } /* * Update a contiguous cluster chain * * pmp - mount point * start - first cluster of chain * count - number of clusters in chain * fillwith - what to write into FAT entry of last cluster */ static int fatchain(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith) { int error; u_long bn, bo, bsize, byteoffset, readcn, newc; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("fatchain(pmp %p, start %lu, count %lu, fillwith %lx)\n", pmp, start, count, fillwith); #endif /* * Be sure the clusters are in the filesystem. */ if (start < CLUST_FIRST || start + count - 1 > pmp->pm_maxcluster) return (EINVAL); while (count > 0) { byteoffset = FATOFS(pmp, start); fatblock(pmp, byteoffset, &bn, &bsize, &bo); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { return (error); } while (count > 0) { start++; newc = --count > 0 ? start : fillwith; switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(bp->b_data + bo); if (start & 1) { readcn &= 0xf000; readcn |= newc & 0xfff; } else { readcn &= 0x000f; readcn |= newc << 4; } putushort(bp->b_data + bo, readcn); bo++; if (!(start & 1)) bo++; break; case FAT16_MASK: putushort(bp->b_data + bo, newc); bo += 2; break; case FAT32_MASK: readcn = getulong(bp->b_data + bo); readcn &= ~pmp->pm_fatmask; readcn |= newc & pmp->pm_fatmask; putulong(bp->b_data + bo, readcn); bo += 4; break; } if (bo >= bsize) break; } updatefats(pmp, bp, bn); } pmp->pm_fmod = 1; return (0); } /* * Check the length of a free cluster chain starting at start. 
* * pmp - mount point * start - start of chain * count - maximum interesting length */ static int chainlength(struct msdosfsmount *pmp, u_long start, u_long count) { u_long idx, max_idx; u_int map; u_long len; MSDOSFS_ASSERT_MP_LOCKED(pmp); if (start > pmp->pm_maxcluster) return (0); max_idx = pmp->pm_maxcluster / N_INUSEBITS; idx = start / N_INUSEBITS; start %= N_INUSEBITS; map = pmp->pm_inusemap[idx]; map &= ~((1 << start) - 1); if (map) { len = ffs(map) - 1 - start; len = MIN(len, count); if (start + len > pmp->pm_maxcluster) len = pmp->pm_maxcluster - start + 1; return (len); } len = N_INUSEBITS - start; if (len >= count) { len = count; if (start + len > pmp->pm_maxcluster) len = pmp->pm_maxcluster - start + 1; return (len); } while (++idx <= max_idx) { if (len >= count) break; map = pmp->pm_inusemap[idx]; if (map) { len += ffs(map) - 1; break; } len += N_INUSEBITS; } len = MIN(len, count); if (start + len > pmp->pm_maxcluster) len = pmp->pm_maxcluster - start + 1; return (len); } /* * Allocate contigous free clusters. * * pmp - mount point. * start - start of cluster chain. * count - number of clusters to allocate. * fillwith - put this value into the FAT entry for the * last allocated cluster. * retcluster - put the first allocated cluster's number here. * got - how many clusters were actually allocated. */ static int chainalloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { int error; u_long cl, n; MSDOSFS_ASSERT_MP_LOCKED(pmp); KASSERT((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0, ("chainalloc on ro msdosfs mount")); for (cl = start, n = count; n-- > 0;) usemap_alloc(pmp, cl++); pmp->pm_nxtfree = start + count; if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; pmp->pm_flags |= MSDOSFS_FSIMOD; error = fatchain(pmp, start, count, fillwith); if (error != 0) { for (cl = start, n = count; n-- > 0;) usemap_free(pmp, cl++); return (error); } #ifdef MSDOSFS_DEBUG printf("clusteralloc(): allocated cluster chain at %lu (%lu clusters)\n", start, count); #endif if (retcluster) *retcluster = start; if (got) *got = count; return (0); } /* * Allocate contiguous free clusters. * * pmp - mount point. * start - preferred start of cluster chain. * count - number of clusters requested. * fillwith - put this value into the FAT entry for the * last allocated cluster. * retcluster - put the first allocated cluster's number here. * got - how many clusters were actually allocated. 
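A hedged usage sketch for clusteralloc(), following the parameter description above and the way extendfile() below consumes it; hintcn and count stand in for caller state:

	u_long first, got;
	int error;

	error = clusteralloc(pmp, hintcn, count, CLUST_EOFE, &first, &got);
	if (error == 0 && got < count) {
		/* Only a shorter contiguous run was available; the caller
		 * (e.g. extendfile()) loops to allocate the remainder. */
	}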
*/ int clusteralloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { int error; MSDOSFS_LOCK_MP(pmp); error = clusteralloc1(pmp, start, count, fillwith, retcluster, got); MSDOSFS_UNLOCK_MP(pmp); return (error); } static int clusteralloc1(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { u_long idx; u_long len, newst, foundl, cn, l; u_long foundcn = 0; /* XXX: foundcn could be used unititialized */ u_int map; MSDOSFS_ASSERT_MP_LOCKED(pmp); #ifdef MSDOSFS_DEBUG printf("clusteralloc(): find %lu clusters\n", count); #endif if (start) { if ((len = chainlength(pmp, start, count)) >= count) return (chainalloc(pmp, start, count, fillwith, retcluster, got)); } else len = 0; newst = pmp->pm_nxtfree; foundl = 0; for (cn = newst; cn <= pmp->pm_maxcluster;) { idx = cn / N_INUSEBITS; map = pmp->pm_inusemap[idx]; map |= (1U << (cn % N_INUSEBITS)) - 1; if (map != FULL_RUN) { cn = idx * N_INUSEBITS + ffs(map ^ FULL_RUN) - 1; if ((l = chainlength(pmp, cn, count)) >= count) return (chainalloc(pmp, cn, count, fillwith, retcluster, got)); if (l > foundl) { foundcn = cn; foundl = l; } cn += l + 1; continue; } cn += N_INUSEBITS - cn % N_INUSEBITS; } for (cn = 0; cn < newst;) { idx = cn / N_INUSEBITS; map = pmp->pm_inusemap[idx]; map |= (1U << (cn % N_INUSEBITS)) - 1; if (map != FULL_RUN) { cn = idx * N_INUSEBITS + ffs(map ^ FULL_RUN) - 1; if ((l = chainlength(pmp, cn, count)) >= count) return (chainalloc(pmp, cn, count, fillwith, retcluster, got)); if (l > foundl) { foundcn = cn; foundl = l; } cn += l + 1; continue; } cn += N_INUSEBITS - cn % N_INUSEBITS; } if (!foundl) return (ENOSPC); if (len) return (chainalloc(pmp, start, len, fillwith, retcluster, got)); else return (chainalloc(pmp, foundcn, foundl, fillwith, retcluster, got)); } /* * Free a chain of clusters. * * pmp - address of the msdosfs mount structure for the filesystem * containing the cluster chain to be freed. * startcluster - number of the 1st cluster in the chain of clusters to be * freed. */ int freeclusterchain(struct msdosfsmount *pmp, u_long cluster) { int error; struct buf *bp = NULL; u_long bn, bo, bsize, byteoffset; u_long readcn, lbn = -1; MSDOSFS_LOCK_MP(pmp); while (cluster >= CLUST_FIRST && cluster <= pmp->pm_maxcluster) { byteoffset = FATOFS(pmp, cluster); fatblock(pmp, byteoffset, &bn, &bsize, &bo); if (lbn != bn) { if (bp) updatefats(pmp, bp, lbn); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { MSDOSFS_UNLOCK_MP(pmp); return (error); } lbn = bn; } usemap_free(pmp, cluster); switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(bp->b_data + bo); if (cluster & 1) { cluster = readcn >> 4; readcn &= 0x000f; readcn |= MSDOSFSFREE << 4; } else { cluster = readcn; readcn &= 0xf000; readcn |= MSDOSFSFREE & 0xfff; } putushort(bp->b_data + bo, readcn); break; case FAT16_MASK: cluster = getushort(bp->b_data + bo); putushort(bp->b_data + bo, MSDOSFSFREE); break; case FAT32_MASK: cluster = getulong(bp->b_data + bo); putulong(bp->b_data + bo, (MSDOSFSFREE & FAT32_MASK) | (cluster & ~FAT32_MASK)); break; } cluster &= pmp->pm_fatmask; if ((cluster | ~pmp->pm_fatmask) >= CLUST_RSRVD) cluster |= pmp->pm_fatmask; } if (bp) updatefats(pmp, bp, bn); MSDOSFS_UNLOCK_MP(pmp); return (0); } /* * Read in FAT blocks looking for free clusters. For every free cluster * found turn off its corresponding bit in the pm_inusemap. 
*/ int fillinusemap(struct msdosfsmount *pmp) { struct buf *bp; u_long bn, bo, bsize, byteoffset, cn, readcn; int error; MSDOSFS_ASSERT_MP_LOCKED(pmp); bp = NULL; /* * Mark all clusters in use, we mark the free ones in the FAT scan * loop further down. */ for (cn = 0; cn < (pmp->pm_maxcluster + N_INUSEBITS) / N_INUSEBITS; cn++) pmp->pm_inusemap[cn] = FULL_RUN; /* * Figure how many free clusters are in the filesystem by ripping * through the FAT counting the number of entries whose content is * zero. These represent free clusters. */ pmp->pm_freeclustercount = 0; for (cn = 0; cn <= pmp->pm_maxcluster; cn++) { byteoffset = FATOFS(pmp, cn); bo = byteoffset % pmp->pm_fatblocksize; if (bo == 0) { /* Read new FAT block */ if (bp != NULL) brelse(bp); fatblock(pmp, byteoffset, &bn, &bsize, NULL); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error != 0) return (error); } if (FAT32(pmp)) readcn = getulong(bp->b_data + bo); else readcn = getushort(bp->b_data + bo); if (FAT12(pmp) && (cn & 1)) readcn >>= 4; readcn &= pmp->pm_fatmask; /* * Check if the FAT ID matches the BPB's media descriptor and * all other bits are set to 1. */ if (cn == 0 && readcn != ((pmp->pm_fatmask & 0xffffff00) | pmp->pm_bpb.bpbMedia)) { #ifdef MSDOSFS_DEBUG printf("mountmsdosfs(): Media descriptor in BPB" "does not match FAT ID\n"); #endif brelse(bp); return (EINVAL); } else if (readcn == CLUST_FREE) usemap_free(pmp, cn); } if (bp != NULL) brelse(bp); for (cn = pmp->pm_maxcluster + 1; cn < (pmp->pm_maxcluster + N_INUSEBITS) / N_INUSEBITS; cn++) pmp->pm_inusemap[cn / N_INUSEBITS] |= 1U << (cn % N_INUSEBITS); return (0); } /* * Allocate a new cluster and chain it onto the end of the file. * * dep - the file to extend * count - number of clusters to allocate * bpp - where to return the address of the buf header for the first new * file block * ncp - where to put cluster number of the first newly allocated cluster * If this pointer is 0, do not return the cluster number. * flags - see fat.h * * NOTE: This function is not responsible for turning on the DE_UPDATE bit of * the de_flag field of the denode and it does not change the de_FileSize * field. This is left for the caller to do. */ int extendfile(struct denode *dep, u_long count, struct buf **bpp, u_long *ncp, int flags) { int error; u_long frcn; u_long cn, got; struct msdosfsmount *pmp = dep->de_pmp; struct buf *bp; daddr_t blkno; /* * Don't try to extend the root directory */ if (dep->de_StartCluster == MSDOSFSROOT && (dep->de_Attributes & ATTR_DIRECTORY)) { #ifdef MSDOSFS_DEBUG printf("extendfile(): attempt to extend root directory\n"); #endif return (ENOSPC); } /* * If the "file's last cluster" cache entry is empty, and the file * is not empty, then fill the cache entry by calling pcbmap(). */ if (dep->de_fc[FC_LASTFC].fc_frcn == FCE_EMPTY && dep->de_StartCluster != 0) { error = pcbmap(dep, 0xffff, 0, &cn, 0); /* we expect it to return E2BIG */ if (error != E2BIG) return (error); } dep->de_fc[FC_NEXTTOLASTFC].fc_frcn = dep->de_fc[FC_LASTFC].fc_frcn; dep->de_fc[FC_NEXTTOLASTFC].fc_fsrcn = dep->de_fc[FC_LASTFC].fc_fsrcn; while (count > 0) { /* * Allocate a new cluster chain and cat onto the end of the * file. If the file is empty we make de_StartCluster point * to the new block. Note that de_StartCluster being 0 is * sufficient to be sure the file is empty since we exclude * attempts to extend the root directory above, and the root * dir is the only file with a startcluster of 0 that has * blocks allocated (sort of). 
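To make the FAT ID check above concrete: for a FAT32 volume whose BPB media descriptor is 0xf8, entry 0 of the FAT is expected to read back as follows (illustrative arithmetic only):

	u_long expected;

	expected = (pmp->pm_fatmask & 0xffffff00) | pmp->pm_bpb.bpbMedia;
	/* FAT32_MASK & 0xffffff00 == 0x0fffff00, so with media 0xf8 the
	 * expected value is 0x0ffffff8. */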
*/ if (dep->de_StartCluster == 0) cn = 0; else cn = dep->de_fc[FC_LASTFC].fc_fsrcn + 1; error = clusteralloc(pmp, cn, count, CLUST_EOFE, &cn, &got); if (error) return (error); count -= got; /* * Give them the filesystem relative cluster number if they want * it. */ if (ncp) { *ncp = cn; ncp = NULL; } if (dep->de_StartCluster == 0) { dep->de_StartCluster = cn; frcn = 0; } else { error = fatentry(FAT_SET, pmp, dep->de_fc[FC_LASTFC].fc_fsrcn, 0, cn); if (error) { clusterfree(pmp, cn, NULL); return (error); } frcn = dep->de_fc[FC_LASTFC].fc_frcn + 1; } /* * Update the "last cluster of the file" entry in the * denode's FAT cache. */ fc_setcache(dep, FC_LASTFC, frcn + got - 1, cn + got - 1); if (flags & DE_CLEAR) { while (got-- > 0) { /* * Get the buf header for the new block of the file. */ if (dep->de_Attributes & ATTR_DIRECTORY) bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++), pmp->pm_bpcluster, 0, 0, 0); else { bp = getblk(DETOV(dep), frcn++, pmp->pm_bpcluster, 0, 0, 0); /* * Do the bmap now, as in msdosfs_write */ if (pcbmap(dep, bp->b_lblkno, &blkno, 0, 0)) bp->b_blkno = -1; if (bp->b_blkno == -1) panic("extendfile: pcbmap"); else bp->b_blkno = blkno; } clrbuf(bp); if (bpp) { *bpp = bp; bpp = NULL; } else { bdwrite(bp); } if (vm_page_count_severe() || buf_dirty_count_severe()) vn_fsync_buf(DETOV(dep), MNT_WAIT); } } } return (0); } /*- * Routine to mark a FAT16 or FAT32 volume as "clean" or "dirty" by * manipulating the upper bit of the FAT entry for cluster 1. Note that * this bit is not defined for FAT12 volumes, which are always assumed to * be clean. * * The fatentry() routine only works on cluster numbers that a file could * occupy, so it won't manipulate the entry for cluster 1. So we have to do * it here. The code was stolen from fatentry() and tailored for cluster 1. * * Inputs: * pmp The MS-DOS volume to mark * dirty Non-zero if the volume should be marked dirty; zero if it * should be marked clean * * Result: * 0 Success * EROFS Volume is read-only * ? (other errors from called routines) */ int -markvoldirty(struct msdosfsmount *pmp, int dirty) +markvoldirty_upgrade(struct msdosfsmount *pmp, bool dirty, bool rw_upgrade) { struct buf *bp; u_long bn, bo, bsize, byteoffset, fatval; int error; /* * FAT12 does not support a "clean" bit, so don't do anything for * FAT12. */ if (FAT12(pmp)) return (0); - /* Can't change the bit on a read-only filesystem. */ - if (pmp->pm_flags & MSDOSFSMNT_RONLY) + /* + * Can't change the bit on a read-only filesystem, except as part of + * ro->rw upgrade. + */ + if ((pmp->pm_flags & MSDOSFSMNT_RONLY) != 0 && !rw_upgrade) return (EROFS); /* * Fetch the block containing the FAT entry. It is given by the * pseudo-cluster 1. */ byteoffset = FATOFS(pmp, 1); fatblock(pmp, byteoffset, &bn, &bsize, &bo); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) return (error); /* * Get the current value of the FAT entry and set/clear the relevant * bit. Dirty means clear the "clean" bit; clean means set the * "clean" bit. */ if (FAT32(pmp)) { /* FAT32 uses bit 27. */ fatval = getulong(&bp->b_data[bo]); if (dirty) fatval &= 0xF7FFFFFF; else fatval |= 0x08000000; putulong(&bp->b_data[bo], fatval); } else { /* Must be FAT16; use bit 15. */ fatval = getushort(&bp->b_data[bo]); if (dirty) fatval &= 0x7FFF; else fatval |= 0x8000; putushort(&bp->b_data[bo], fatval); } + + /* + * The concern here is that a devvp may be readonly, without reporting + * itself as such through the usual channels. 
In that case, we'd like + * it if attempting to mount msdosfs rw didn't panic the system. + * + * markvoldirty is invoked as the first write on backing devvps when + * either msdosfs is mounted for the first time, or a ro mount is + * upgraded to rw. + * + * In either event, if a write error occurs dirtying the volume: + * - No user data has been permitted to be written to cache yet. + * - We can abort the high-level operation (mount, or ro->rw) safely. + * - We don't derive any benefit from leaving a zombie dirty buf in + * the cache that can not be cleaned or evicted. + * + * So, mark B_INVALONERR to have bwrite() -> brelse() detect that + * condition and force-invalidate our write to the block if it occurs. + * + * PR 210316 provides more context on the discovery and diagnosis of + * the problem, as well as earlier attempts to solve it. + */ + bp->b_flags |= B_INVALONERR; /* Write out the modified FAT block synchronously. */ return (bwrite(bp)); } Index: head/sys/fs/msdosfs/msdosfs_vfsops.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_vfsops.c (revision 352232) +++ head/sys/fs/msdosfs/msdosfs_vfsops.c (revision 352233) @@ -1,978 +1,985 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. 
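The read-side counterpart of the bit manipulation above, as a hypothetical helper: given the FAT entry for pseudo-cluster 1 (fetched the same way markvoldirty_upgrade() fetches it), the volume is clean when bit 27 (FAT32) or bit 15 (FAT16) is set, and FAT12 is always treated as clean.

	static bool
	volume_is_clean(struct msdosfsmount *pmp, u_long fatval)
	{

		if (FAT12(pmp))
			return (true);			/* no clean bit on FAT12 */
		if (FAT32(pmp))
			return ((fatval & 0x08000000) != 0);
		return ((fatval & 0x8000) != 0);	/* FAT16 */
	}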
* * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MSDOSFS_DEBUG #include #endif static const char msdosfs_lock_msg[] = "fatlk"; /* Mount options that we support. */ static const char *msdosfs_opts[] = { "async", "noatime", "noclusterr", "noclusterw", "export", "force", "from", "sync", "cs_dos", "cs_local", "cs_win", "dirmask", "gid", "kiconv", "longname", "longnames", "mask", "shortname", "shortnames", "uid", "win95", "nowin95", NULL }; #if 1 /*def PC98*/ /* * XXX - The boot signature formatted by NEC PC-98 DOS looks like a * garbage or a random value :-{ * If you want to use that broken-signatured media, define the * following symbol even though PC/AT. * (ex. mount PC-98 DOS formatted FD on PC/AT) */ #define MSDOSFS_NOCHECKSIG #endif MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure"); static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table"); struct iconv_functions *msdosfs_iconv; static int update_mp(struct mount *mp, struct thread *td); static int mountmsdosfs(struct vnode *devvp, struct mount *mp); static vfs_fhtovp_t msdosfs_fhtovp; static vfs_mount_t msdosfs_mount; static vfs_root_t msdosfs_root; static vfs_statfs_t msdosfs_statfs; static vfs_sync_t msdosfs_sync; static vfs_unmount_t msdosfs_unmount; /* Maximum length of a character set name (arbitrary). */ #define MAXCSLEN 64 static int update_mp(struct mount *mp, struct thread *td) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); void *dos, *win, *local; int error, v; if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) { if (msdosfs_iconv != NULL) { error = vfs_getopt(mp->mnt_optnew, "cs_win", &win, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_local", &local, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_dos", &dos, NULL); if (!error) { msdosfs_iconv->open(win, local, &pmp->pm_u2w); msdosfs_iconv->open(local, win, &pmp->pm_w2u); msdosfs_iconv->open(dos, local, &pmp->pm_u2d); msdosfs_iconv->open(local, dos, &pmp->pm_d2u); } if (error != 0) return (error); } else { pmp->pm_w2u = NULL; pmp->pm_u2w = NULL; pmp->pm_d2u = NULL; pmp->pm_u2d = NULL; } } if (vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v) == 1) pmp->pm_gid = v; if (vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v) == 1) pmp->pm_uid = v; if (vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v) == 1) pmp->pm_mask = v & ALLPERMS; if (vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v) == 1) pmp->pm_dirmask = v & ALLPERMS; vfs_flagopt(mp->mnt_optnew, "shortname", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "shortnames", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "longname", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "longnames", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "kiconv", &pmp->pm_flags, MSDOSFSMNT_KICONV); if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; else pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else pmp->pm_flags |= MSDOSFSMNT_LONGNAME; return 0; } static int msdosfs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct msdosfs_args args; struct export_args exp; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); 
vfs_oexport_conv(&args.export, &exp); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &exp, sizeof(exp)); ma = mount_argf(ma, "uid", "%d", args.uid); ma = mount_argf(ma, "gid", "%d", args.gid); ma = mount_argf(ma, "mask", "%d", args.mask); ma = mount_argf(ma, "dirmask", "%d", args.dirmask); ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname"); ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv"); ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN); ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN); ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN); error = kernel_mount(ma, flags); return (error); } /* * mp - path - addr in user space of mount point (ie /usr or whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ static int msdosfs_mount(struct mount *mp) { struct vnode *devvp; /* vnode for blk device to mount */ struct thread *td; /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; struct nameidata ndp; int error, flags; accmode_t accmode; char *from; td = curthread; if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts)) return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, 0, flags, td); if (error) return (error); /* * Now the volume is clean. Mark it so while the * device is still rw. */ error = markvoldirty(pmp, 0); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* Downgrade the device from rw to ro. */ g_topology_lock(); error = g_access(pmp->pm_cp, 0, -1, 0); g_topology_unlock(); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* * Backing out after an error was painful in the * above. Now we are committed to succeeding. */ pmp->pm_fmod = 0; pmp->pm_flags |= MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ devvp = pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0); return (error); } VOP_UNLOCK(devvp, 0); g_topology_lock(); error = g_access(pmp->pm_cp, 0, 1, 0); g_topology_unlock(); if (error) return (error); + /* Now that the volume is modifiable, mark it dirty. */ + error = markvoldirty_upgrade(pmp, true, true); + if (error) { + /* + * If dirtying the superblock failed, drop GEOM + * 'w' refs (we're still RO). + */ + g_topology_lock(); + (void)g_access(pmp->pm_cp, 0, -1, 0); + g_topology_unlock(); + + return (error); + } + pmp->pm_fmod = 1; pmp->pm_flags &= ~MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); - - /* Now that the volume is modifiable, mark it dirty. 
*/ - error = markvoldirty(pmp, 1); - if (error) - return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); error = namei(&ndp); if (error) return (error); devvp = ndp.ni_vp; NDFREE(&ndp, NDF_ONLY_PNBUF); if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = mountmsdosfs(devvp, mp); #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { vput(devvp); if (devvp != pmp->pm_devvp) return (EINVAL); /* XXX needs translation */ } if (error) { vrele(devvp); return (error); } error = update_mp(mp, td); if (error) { if ((mp->mnt_flag & MNT_UPDATE) == 0) msdosfs_unmount(mp, MNT_FORCE); return error; } vfs_mountedfrom(mp, from); #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return (0); } static int mountmsdosfs(struct vnode *devvp, struct mount *mp) { struct msdosfsmount *pmp; struct buf *bp; struct cdev *dev; union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; uint8_t SecPerClust; u_long clusters; int ronly, error; struct g_consumer *cp; struct bufobj *bo; bp = NULL; /* This and pmp both used in error_exit. */ pmp = NULL; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { VOP_UNLOCK(devvp, 0); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); VOP_UNLOCK(devvp, 0); return (error); } dev_ref(dev); bo = &devvp->v_bufobj; VOP_UNLOCK(devvp, 0); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. * * NOTE: 8192 is a magic size that works for ffs. */ error = bread(devvp, 0, 8192, NOCRED, &bp); if (error) goto error_exit; bp->b_flags |= B_AGE; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB; #ifndef MSDOSFS_NOCHECKSIG if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { error = EINVAL; goto error_exit; } #endif pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO); pmp->pm_mountp = mp; pmp->pm_cp = cp; pmp->pm_bo = bo; lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0); /* * Initialize ownerships and permissions, since nothing else will * initialize them iff we are mounting root. */ pmp->pm_uid = UID_ROOT; pmp->pm_gid = GID_WHEEL; pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH | S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR; /* * Compute several useful quantities from the bpb in the * bootsector. 
Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. */ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); if (pmp->pm_BytesPerSec < DEV_BSIZE) { error = EINVAL; goto error_exit; } pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; /* calculate the ratio of sector size to DEV_BSIZE */ pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE; /* * We don't check pm_Heads nor pm_SecPerTrack, because * these may not be set for EFI file systems. We don't * use these anyway, so we're unaffected if they are * invalid. */ if (!pmp->pm_BytesPerSec || !SecPerClust) { error = EINVAL; goto error_exit; } if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } if (pmp->pm_RootDirEnts == 0) { if (pmp->pm_FATsecs || getushort(b710->bpbFSVers)) { error = EINVAL; #ifdef MSDOSFS_DEBUG printf("mountmsdosfs(): bad FAT32 filesystem\n"); #endif goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); if (getushort(b710->bpbExtFlags) & FATMIRROR) pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; else pmp->pm_flags |= MSDOSFS_FATMIRROR; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; /* * Check a few values (could do some more): * - logical sector size: power of 2, >= block size * - sectors per cluster: power of 2, >= 1 * - number of sectors: >= 1, <= size of partition * - number of FAT sectors: >= 1 */ if ( (SecPerClust == 0) || (SecPerClust & (SecPerClust - 1)) || (pmp->pm_BytesPerSec < DEV_BSIZE) || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1)) || (pmp->pm_HugeSectors == 0) || (pmp->pm_FATsecs == 0) || (SecPerClust * pmp->pm_BlkPerSec > MAXBSIZE / DEV_BSIZE) ) { error = EINVAL; goto error_exit; } pmp->pm_HugeSectors *= pmp->pm_BlkPerSec; pmp->pm_HiddenSects *= pmp->pm_BlkPerSec; /* XXX not used? */ pmp->pm_FATsecs *= pmp->pm_BlkPerSec; SecPerClust *= pmp->pm_BlkPerSec; pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec; } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = howmany(pmp->pm_RootDirEnts * sizeof(struct direntry), DEV_BSIZE); /* in blocks */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust + 1; pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE; /* XXX not used? */ if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one FAT entry will not be split across * multiple blocks. 
*/ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv; if (pmp->pm_maxcluster >= clusters) { #ifdef MSDOSFS_DEBUG printf("Warning: number of clusters (%ld) exceeds FAT " "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters); #endif pmp->pm_maxcluster = clusters - 1; } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * 512; else pmp->pm_fatblocksize = PAGE_SIZE; pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize, pmp->pm_BytesPerSec); pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE; pmp->pm_bnshift = ffs(DEV_BSIZE) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * DEV_BSIZE; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check the fsinfo sector if we have one. Silently fix up our * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff) * or too large. Ignore fp->fsinfree for now, since we need to * read the entire FAT anyway to fill the inuse map. */ if (pmp->pm_fsinfo) { struct fsinfo *fp; if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!bcmp(fp->fsisig1, "RRaA", 4) && !bcmp(fp->fsisig2, "rrAa", 4) && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) { pmp->pm_nxtfree = getulong(fp->fsinxtfree); if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; } else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Finish initializing pmp->pm_nxtfree (just in case the first few * sectors aren't properly reserved in the FAT). This completes * the fixup for fp->fsinxtfree, and fixes up the zero-initialized * value if there is no fsinfo. We will use pmp->pm_nxtfree * internally even if there is no fsinfo. */ if (pmp->pm_nxtfree < CLUST_FIRST) pmp->pm_nxtfree = CLUST_FIRST; /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_devvp = devvp; pmp->pm_dev = dev; /* * Have the inuse map filled in. */ MSDOSFS_LOCK_MP(pmp); error = fillinusemap(pmp); MSDOSFS_UNLOCK_MP(pmp); if (error != 0) goto error_exit; /* * If they want FAT updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the FAT being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. */ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. 
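Worked arithmetic for the derived values above (illustrative figures, assuming 512-byte sectors and, for the second part, a 4096-byte cluster):

	/*
	 * FAT12: pm_fatblocksize = 3 * 512 = 1536 bytes, and
	 * 1536 * 2 / 3 = 1024 whole 12-bit entries fit in one FAT cache
	 * block, so no entry straddles a block boundary.
	 *
	 * pm_bpcluster = 4096 gives pm_crbomask = 0xfff and pm_cnshift = 12,
	 * so file offset 10000 maps to relative cluster 10000 >> 12 == 2 at
	 * byte 10000 & 0xfff == 1808 within that cluster.
	 */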
*/ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else { - if ((error = markvoldirty(pmp, 1)) != 0) { - (void)markvoldirty(pmp, 0); + if ((error = markvoldirty(pmp, 1)) != 0) goto error_exit; - } pmp->pm_fmod = 1; } mp->mnt_data = pmp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_USES_BCACHE | MNTK_NO_IOPF; MNT_IUNLOCK(mp); return (0); error_exit: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (pmp) { lockdestroy(&pmp->pm_fatlock); free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; } atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); dev_rel(dev); return (error); } /* * Unmount the filesystem described by mp. */ static int msdosfs_unmount(struct mount *mp, int mntflags) { struct msdosfsmount *pmp; int error, flags; error = flags = 0; pmp = VFSTOMSDOSFS(mp); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) error = msdosfs_sync(mp, MNT_WAIT); if ((mntflags & MNT_FORCE) != 0) flags |= FORCECLOSE; else if (error != 0) return (error); error = vflush(mp, 0, flags, curthread); if (error != 0 && error != ENXIO) return (error); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) { error = markvoldirty(pmp, 0); if (error && error != ENXIO) { (void)markvoldirty(pmp, 1); return (error); } } if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { if (pmp->pm_w2u) msdosfs_iconv->close(pmp->pm_w2u); if (pmp->pm_u2w) msdosfs_iconv->close(pmp->pm_u2w); if (pmp->pm_d2u) msdosfs_iconv->close(pmp->pm_d2u); if (pmp->pm_u2d) msdosfs_iconv->close(pmp->pm_u2d); } #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); VI_LOCK(vp); vn_printf(vp, "msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("freef %p, freeb %p, mount %p\n", TAILQ_NEXT(vp, v_actfreelist), vp->v_actfreelist.tqe_prev, vp->v_mount); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n", TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd), TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd), vp->v_bufobj.bo_numoutput, vp->v_type); VI_UNLOCK(vp); BO_UNLOCK(bo); } #endif g_topology_lock(); g_vfs_close(pmp->pm_cp); g_topology_unlock(); atomic_store_rel_ptr((uintptr_t *)&pmp->pm_dev->si_mountpt, 0); vrele(pmp->pm_devvp); dev_rel(pmp->pm_dev); free(pmp->pm_inusemap, M_MSDOSFSFAT); lockdestroy(&pmp->pm_fatlock); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } static int msdosfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct denode *ndep; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep); if (error) return (error); *vpp = DETOV(ndep); return (0); } static int msdosfs_statfs(struct mount *mp, struct statfs *sbp) { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_maxcluster + 1; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? */ return (0); } /* * If we have an FSInfo block, update it. 
*/ static int msdosfs_fsiflush(struct msdosfsmount *pmp, int waitfor) { struct fsinfo *fp; struct buf *bp; int error; MSDOSFS_LOCK_MP(pmp); if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) { error = 0; goto unlock; } error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp); if (error != 0) { goto unlock; } fp = (struct fsinfo *)bp->b_data; putulong(fp->fsinfree, pmp->pm_freeclustercount); putulong(fp->fsinxtfree, pmp->pm_nxtfree); pmp->pm_flags &= ~MSDOSFS_FSIMOD; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); unlock: MSDOSFS_UNLOCK_MP(pmp); return (error); } static int msdosfs_sync(struct mount *mp, int waitfor) { struct vnode *vp, *nvp; struct thread *td; struct denode *dep; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; td = curthread; /* * If we ever switch to not updating all of the FATs all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update FATs here */ } } /* * Write back each (modified) denode. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, nvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } dep = VTODE(vp); if ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); if (error) { if (error == ENOENT) goto loop; continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp, 0); vrele(vp); } /* * Flush filesystem control info. */ if (waitfor != MNT_LAZY) { vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(pmp->pm_devvp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(pmp->pm_devvp, 0); } error = msdosfs_fsiflush(pmp, waitfor); if (error != 0) allerror = error; return (allerror); } static int msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid *defhp = (struct defid *) fhp; struct denode *dep; int error; error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep); if (error) { *vpp = NULLVP; return (error); } *vpp = DETOV(dep); vnode_create_vobject(*vpp, dep->de_FileSize, curthread); return (0); } static struct vfsops msdosfs_vfsops = { .vfs_fhtovp = msdosfs_fhtovp, .vfs_mount = msdosfs_mount, .vfs_cmount = msdosfs_cmount, .vfs_root = msdosfs_root, .vfs_statfs = msdosfs_statfs, .vfs_sync = msdosfs_sync, .vfs_unmount = msdosfs_unmount, }; VFS_SET(msdosfs_vfsops, msdosfs, 0); MODULE_VERSION(msdosfs, 1); Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 352232) +++ head/sys/kern/vfs_bio.c (revision 352233) @@ -1,5450 +1,5463 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; struct bufqueue { struct mtx_padalign bq_lock; TAILQ_HEAD(, buf) bq_queue; uint8_t bq_index; uint16_t bq_subqueue; int bq_len; } __aligned(CACHE_LINE_SIZE); #define BQ_LOCKPTR(bq) (&(bq)->bq_lock) #define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) #define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) #define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) struct bufdomain { struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ struct bufqueue bd_dirtyq; struct bufqueue *bd_cleanq; struct mtx_padalign bd_run_lock; /* Constants */ long bd_maxbufspace; long bd_hibufspace; long bd_lobufspace; long bd_bufspacethresh; int bd_hifreebuffers; int bd_lofreebuffers; int bd_hidirtybuffers; int bd_lodirtybuffers; int bd_dirtybufthresh; int bd_lim; /* atomics */ int bd_wanted; int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; int __aligned(CACHE_LINE_SIZE) bd_running; long __aligned(CACHE_LINE_SIZE) bd_bufspace; int __aligned(CACHE_LINE_SIZE) bd_freebuffers; } __aligned(CACHE_LINE_SIZE); #define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) #define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) #define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) #define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) #define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) #define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) #define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) #define BD_DOMAIN(bd) (bd - bdomain) static struct buf *buf; /* buffer header pool */ extern 
struct buf *swbuf; /* Swap buffer header pool. */ caddr_t unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, struct bufdomain *, int); static int flushbufqueues(struct vnode *, struct bufdomain *, int, int); static void buf_daemon(void); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); static inline struct bufdomain *bufdomain(struct buf *); static void bq_remove(struct bufqueue *bq, struct buf *bp); static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); static int buf_recycle(struct bufdomain *, bool kva); static void bq_init(struct bufqueue *bq, int qindex, int cpu, const char *lockname); static void bd_init(struct bufdomain *bd); static int bd_flushall(struct bufdomain *bd); static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); static counter_u64_t bufkvaspace; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, 
&hibufspace, __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", "Bufspace consumed before waking the daemon to free some"); static counter_u64_t buffreekvacnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); static counter_u64_t bufdefragcnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I", "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I", "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I", "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I", "Target number of free buffers"); static int hifreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I", "Threshold for clean buffer recycling"); static counter_u64_t getnewbufcalls; 
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, "Number of calls to getnewbuf"); static counter_u64_t getnewbufrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); static counter_u64_t mappingrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static counter_u64_t numbufallocfails; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static counter_u64_t notbufdflushes; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ /* Maximum number of buffer domains. */ #define BUF_DOMAINS 8 struct bufdomainset bdlodirty; /* Domains > lodirty */ struct bufdomainset bdhidirty; /* Domains > hidirty */ /* Configured number of clean queues. */ static int __read_mostly buf_domains; BITSET_DEFINE(bufdomainset, BUF_DOMAINS); struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; struct bufqueue __exclusive_cache_line bqempty; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. 
*/ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) { int error; int value; int i; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(int *)arg1 = value; for (i = 0; i < buf_domains; i++) *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) { long value; int error; int i; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(long *)arg1 = value; for (i = 0; i < buf_domains; i++) *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #else static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; return (sysctl_handle_long(oidp, &lvalue, 0, req)); } #endif static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS) { int value; int i; value = 0; for (i = 0; i < buf_domains; i++) value += bdomain[i].bd_numdirtybuffers; return (sysctl_handle_int(oidp, &value, 0, req)); } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bd_clear: * * Clear a domain from the appropriate bitsets when dirtybuffers * is decremented. */ static void bd_clear(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bd_set: * * Set a domain in the appropriate bitsets when dirtybuffers * is incremented. */ static void bd_set(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). 
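The sysctl handlers above take the byte offset of a struct bufdomain field as their argument and fan the new global value out to every domain, divided evenly. A standalone sketch of that offset-driven update (the struct and names are simplified stand-ins):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct domain {
	long	maxspace;
	long	hispace;
};

#define	NDOMAINS	4
static struct domain domains[NDOMAINS];

/* Spread a global tunable across all domains by writing the field at 'offset'. */
static void
set_domain_long(size_t offset, long value)
{
	for (int i = 0; i < NDOMAINS; i++)
		*(long *)(uintptr_t)((uintptr_t)&domains[i] + offset) =
		    value / NDOMAINS;
}

int
main(void)
{
	set_domain_long(offsetof(struct domain, hispace), 1000);
	printf("domain 2 hispace = %ld\n", domains[2].hispace);	/* 250 */
	return (0);
}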
*/ static void bdirtysub(struct buf *bp) { struct bufdomain *bd; int num; bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bdirtywakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_clear(bd); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(struct buf *bp) { struct bufdomain *bd; int num; /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bd_wakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_set(bd); } /* * bufspace_daemon_wakeup: * * Wakeup the daemons responsible for freeing clean bufs. */ static void bufspace_daemon_wakeup(struct bufdomain *bd) { /* * avoid the lock if the daemon is running. */ if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 1); wakeup(&bd->bd_running); BD_RUN_UNLOCK(bd); } } /* * bufspace_daemon_wait: * * Sleep until the domain falls below a limit or one second passes. */ static void bufspace_daemon_wait(struct bufdomain *bd) { /* * Re-check our limits and sleep. bd_running must be * cleared prior to checking the limits to avoid missed * wakeups. The waker will adjust one of bufspace or * freebuffers prior to checking bd_running. */ BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 0); if (bd->bd_bufspace < bd->bd_bufspacethresh && bd->bd_freebuffers > bd->bd_lofreebuffers) { msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP, "-", hz); } else { /* Avoid spurious wakeups while running. */ atomic_store_int(&bd->bd_running, 1); BD_RUN_UNLOCK(bd); } } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); bd = bufdomain(bp); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bd->bd_bufspace, -diff); } else if (diff > 0) { space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && space + diff >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { long limit, new; long space; if (metadata) limit = bd->bd_maxbufspace; else limit = bd->bd_hibufspace; space = atomic_fetchadd_long(&bd->bd_bufspace, size); new = space + size; if (new > limit) { atomic_subtract_long(&bd->bd_bufspace, size); return (ENOSPC); } /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(struct bufdomain *bd, int size) { atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. bd_wanted must be set prior to polling for space. 
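bufspace_reserve() above optimistically adds the reservation, backs it out again if the limit would be exceeded, and wakes the space daemon only on the request that crosses the threshold. A userspace sketch of the same pattern with C11 atomics (the limits and the wakeup are placeholders):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long bufspace;
static const long limit = 1 << 20;
static const long thresh = 1 << 19;

static void daemon_wakeup(void) { printf("wake space daemon\n"); }

static int
reserve(long size)
{
	long space, new;

	space = atomic_fetch_add(&bufspace, size);
	new = space + size;
	if (new > limit) {
		atomic_fetch_sub(&bufspace, size);	/* roll the reservation back */
		return (-1);				/* ENOSPC analogue */
	}
	/* Only the request that crosses the threshold pays for the wakeup. */
	if (space < thresh && new >= thresh)
		daemon_wakeup();
	return (0);
}

int
main(void)
{
	for (int i = 0; i < 70; i++)
		if (reserve(16384) != 0)
			printf("reservation %d refused\n", i);
	return (0);
}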
The * operation must be re-tried on return. */ static void bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; BD_LOCK(bd); while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make a progress that * cannot be achieved by the buf_daemon, that * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns lock * for another dirty buffer for the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, bd, flushbufqtarget); td->td_pflags &= norunbuf; BD_LOCK(bd); if (fl != 0) continue; if (bd->bd_wanted == 0) break; } error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } BD_UNLOCK(bd); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void *arg) { struct bufdomain *bd; EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, SHUTDOWN_PRI_LAST + 100); bd = arg; for (;;) { kthread_suspend_check(); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * demand: * * 1) The daemon wakes up voluntarily once per-second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below * which will inefficiently trade bufs with bqrelse * until we return to condition 2. */ while (bd->bd_bufspace > bd->bd_lobufspace || bd->bd_freebuffers < bd->bd_hifreebuffers) { if (buf_recycle(bd, false) != 0) { if (bd_flushall(bd)) continue; /* * Speedup dirty if we've run out of clean * buffers. This is possible in particular * because softdep may held many bufs locked * pending writes to other bufs which are * marked for delayed write, exhausting * clean space until they are written. */ bd_speedup(); BD_LOCK(bd); if (bd->bd_wanted) { msleep(&bd->bd_wanted, BD_LOCKPTR(bd), PRIBIO|PDROP, "bufspace", hz/10); } else BD_UNLOCK(bd); } maybe_yield(); } bufspace_daemon_wait(bd); } } /* * bufmallocadjust: * * Adjust the reported bufspace for a malloc managed buffer, possibly * waking any waiters. 
*/ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count according. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_ASSERT_LOCKED(m->object); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * Adjust the maxbcachbuf tunable. */ static void maxbcachebuf_adjust(void) { int i; /* * maxbcachebuf must be a power of 2 >= MAXBSIZE. */ i = 2; while (i * 2 <= maxbcachebuf) i *= 2; maxbcachebuf = i; if (maxbcachebuf < MAXBSIZE) maxbcachebuf = MAXBSIZE; if (maxbcachebuf > MAXPHYS) maxbcachebuf = MAXPHYS; if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) printf("maxbcachebuf=%d\n", maxbcachebuf); } /* * bd_speedup - speedup the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. 
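maxbcachebuf_adjust() above rounds the tunable down to a power of two and then clamps it to the supported range. The same rounding in standalone form (the two bounds here are example values, not the kernel's MAXBSIZE/MAXPHYS):

#include <stdio.h>

/* Round 'v' down to a power of two, then clamp it to [lo, hi] (both powers of two). */
static int
adjust(int v, int lo, int hi)
{
	int i;

	i = 2;
	while (i * 2 <= v)
		i *= 2;
	v = i;
	if (v < lo)
		v = lo;
	if (v > hi)
		v = hi;
	return (v);
}

int
main(void)
{
	printf("%d\n", adjust(100000, 65536, 131072));	/* 65536 */
	printf("%d\n", adjust(1 << 20, 65536, 131072));	/* clamped to 131072 */
	return (0);
}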
*/ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); maxbcachebuf_adjust(); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } if (nswbuf == 0) { nswbuf = min(nbuf / 4, 256); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; } /* * Reserve space for the buffer cache buffers */ buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. 
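kern_vfs_bio_buffer_alloc() above sizes nbuf so that the first 64 MB of RAM gets buffers covering roughly a quarter of it and memory beyond that roughly a tenth. A standalone re-computation of that formula; BKVASIZE is assumed to be 16 KiB here, while the real value is platform-dependent:

#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static long
tune_nbuf(long physmem_kb)
{
	const long bkvasize = 16384;		/* assumed BKVASIZE */
	long factor = 4 * bkvasize / 1024;	/* yields 1/4 of RAM for the first 64 MB */
	long nbuf = 50;

	if (physmem_kb > 4096)
		nbuf += MIN((physmem_kb - 4096) / factor, 65536 / factor);
	if (physmem_kb > 65536)			/* 1/10 of RAM past 64 MB */
		nbuf += MIN((physmem_kb - 65536) * 2 / (factor * 5),
		    32L * 1024 * 1024 / (factor * 5));
	return (nbuf);
}

int
main(void)
{
	printf("nbuf for 8 GiB: %ld\n", tune_nbuf(8L * 1024 * 1024));
	return (0);
}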
*/ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_NONE; bp->b_domain = -1; bp->b_subqueue = mp_maxid + 1; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); bq_insert(&bqempty, bp, false); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per-iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Setup the kva and free list allocators. 
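bufinit() above derives the bufspace limits from nbuf: hibufspace trails maxbufspace by a metadata reserve, lobufspace is 95% of hibufspace, the daemon wakeup threshold sits halfway between them, and hirunningspace is clamped between 1 MiB and 16 MiB. A sketch that reproduces those expressions with assumed example inputs:

#include <stdio.h>

#define	LMAX(a, b)	((a) > (b) ? (a) : (b))
#define	LMIN(a, b)	((a) < (b) ? (a) : (b))
#define	ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int
main(void)
{
	/* Assumed example sizes; the kernel derives these from its tunables. */
	long nbuf = 10000, bkvasize = 16384, maxbcachebuf = 65536;

	long maxbufspace = nbuf * bkvasize;
	long hibufspace = LMAX(3 * maxbufspace / 4,
	    maxbufspace - maxbcachebuf * 10);
	long lobufspace = (hibufspace / 20) * 19;	/* 95% */
	long bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
	long hirunningspace = LMAX(LMIN(ROUNDUP(hibufspace / 64, maxbcachebuf),
	    16L * 1024 * 1024), 1024 * 1024);

	printf("maxbufspace %ld hibufspace %ld lobufspace %ld\n",
	    maxbufspace, hibufspace, lobufspace);
	printf("bufspacethresh %ld hirunningspace %ld\n",
	    bufspacethresh, hirunningspace);
	return (0);
}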
*/ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS); for (i = 0 ; i < buf_domains; i++) { struct bufdomain *bd; bd = &bdomain[i]; bd_init(bd); bd->bd_freebuffers = nbuf / buf_domains; bd->bd_hifreebuffers = hifreebuffers / buf_domains; bd->bd_lofreebuffers = lofreebuffers / buf_domains; bd->bd_bufspace = 0; bd->bd_maxbufspace = maxbufspace / buf_domains; bd->bd_hibufspace = hibufspace / buf_domains; bd->bd_lobufspace = lobufspace / buf_domains; bd->bd_bufspacethresh = bufspacethresh / buf_domains; bd->bd_numdirtybuffers = 0; bd->bd_hidirtybuffers = hidirtybuffers / buf_domains; bd->bd_lodirtybuffers = lodirtybuffers / buf_domains; bd->bd_dirtybufthresh = dirtybufthresh / buf_domains; /* Don't allow more than 2% of bufs in the per-cpu caches. */ bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus; } getnewbufcalls = counter_u64_alloc(M_WAITOK); getnewbufrestarts = counter_u64_alloc(M_WAITOK); mappingrestarts = counter_u64_alloc(M_WAITOK); numbufallocfails = counter_u64_alloc(M_WAITOK); notbufdflushes = counter_u64_alloc(M_WAITOK); buffreekvacnt = counter_u64_alloc(M_WAITOK); bufdefragcnt = counter_u64_alloc(M_WAITOK); bufkvaspace = counter_u64_alloc(M_WAITOK); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); sys_sync(curthread, NULL); #ifdef PREEMPTION /* * Spin for a while to allow interrupt threads to run. */ DELAY(50000 * iter); #else /* * Context switch several times to allow interrupt * threads to run. 
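The domain setup above creates one clean-queue domain per 256 MB of maxbufspace (capped at BUF_DOMAINS), divides every limit by the domain count, and lets only about 2% of a domain's buffers sit in any single per-CPU cache. A small program that redoes that arithmetic with example inputs:

#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	HOWMANY(x, y)	(((x) + (y) - 1) / (y))
#define	BUF_DOMAINS	8

int
main(void)
{
	/* Assumed example values, not the kernel's computed ones. */
	long maxbufspace = 1600000000;
	long nbuf = 100000;
	int ncpus = 4;

	int domains = MIN(HOWMANY(maxbufspace, 256L * 1024 * 1024), BUF_DOMAINS);
	long per_domain_bufs = nbuf / domains;
	/* Roughly 2% of a domain's buffers may sit in each CPU's private cache. */
	long bd_lim = nbuf / domains / 50 / ncpus;

	printf("%d domains, %ld bufs each, per-cpu cache limit %ld\n",
	    domains, per_domain_bufs, bd_lim);
	return (0);
}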
*/ for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL, NULL); thread_unlock(curthread); DELAY(1000); } #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (panicstr == NULL) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } static inline struct bufdomain * bufdomain(struct buf *bp) { return (&bdomain[bp->b_domain]); } static struct bufqueue * bufqueue(struct buf *bp) { switch (bp->b_qindex) { case QUEUE_NONE: /* FALLTHROUGH */ case QUEUE_SENTINEL: return (NULL); case QUEUE_EMPTY: return (&bqempty); case QUEUE_DIRTY: return (&bufdomain(bp)->bd_dirtyq); case QUEUE_CLEAN: return (&bufdomain(bp)->bd_subq[bp->b_subqueue]); default: break; } panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); } /* * Return the locked bufqueue that bp is a member of. */ static struct bufqueue * bufqueue_acquire(struct buf *bp) { struct bufqueue *bq, *nbq; /* * bp can be pushed from a per-cpu queue to the * cleanq while we're waiting on the lock. Retry * if the queues don't match. */ bq = bufqueue(bp); BQ_LOCK(bq); for (;;) { nbq = bufqueue(bp); if (bq == nbq) break; BQ_UNLOCK(bq); BQ_LOCK(nbq); bq = nbq; } return (bq); } /* * binsfree: * * Insert the buffer into the appropriate free list. Requires a * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { struct bufdomain *bd; struct bufqueue *bq; KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, ("binsfree: Invalid qindex %d", qindex)); BUF_ASSERT_XLOCKED(bp); /* * Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { if (bp->b_qindex == qindex) { bp->b_flags |= B_REUSE; bp->b_flags &= ~B_REMFREE; BUF_UNLOCK(bp); return; } bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } bd = bufdomain(bp); if (qindex == QUEUE_CLEAN) { if (bd->bd_lim != 0) bq = &bd->bd_subq[PCPU_GET(cpuid)]; else bq = bd->bd_cleanq; } else bq = &bd->bd_dirtyq; bq_insert(bq, bp, true); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. 
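bufqueue_acquire() above has to cope with a buffer migrating from a per-CPU queue to the clean queue while the lock is being taken, so after locking it re-reads which queue the buffer is on and retries on a mismatch. A simplified pthread sketch of that lock-and-recheck shape (single-threaded here, so the retry never actually fires; the kernel relies on its own locking rules for the unlocked read of the queue pointer):

#include <pthread.h>
#include <stdio.h>

struct queue {
	pthread_mutex_t	lock;
	const char	*name;
};

struct item {
	struct queue	*q;	/* may be changed by whoever holds the old queue's lock */
};

/* Lock the queue an item currently belongs to, retrying if it migrated meanwhile. */
static struct queue *
queue_acquire(struct item *it)
{
	struct queue *q, *nq;

	q = it->q;
	pthread_mutex_lock(&q->lock);
	for (;;) {
		nq = it->q;
		if (q == nq)
			break;
		pthread_mutex_unlock(&q->lock);
		pthread_mutex_lock(&nq->lock);
		q = nq;
	}
	return (q);
}

int
main(void)
{
	struct queue a = { PTHREAD_MUTEX_INITIALIZER, "percpu" };
	struct queue b = { PTHREAD_MUTEX_INITIALIZER, "clean" };
	struct item it = { &a };
	struct queue *q;

	(void)b;
	q = queue_acquire(&it);
	printf("locked %s\n", q->name);
	pthread_mutex_unlock(&q->lock);
	return (0);
}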
*/ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; bq_remove(&bqempty, bp); store[i] = bp; } BQ_UNLOCK(&bqempty); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { struct bufqueue *bq; struct buf *bp; int i; bq = &bqempty; BQ_LOCK(bq); for (i = 0; i < cnt; i++) { bp = store[i]; /* Inline bq_insert() to batch locking. */ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; } BQ_UNLOCK(bq); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(struct bufdomain *bd) { struct buf *bp; int freebufs; /* * We can only run out of bufs in the buf zone if the average buf * is less than BKVASIZE. In this case the actual wait/block will * come from buf_reycle() failing to flush one of these small bufs. */ bp = NULL; freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); if (freebufs > 0) bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { atomic_add_int(&bd->bd_freebuffers, 1); bufspace_daemon_wakeup(bd); counter_u64_add(numbufallocfails, 1); return (NULL); } /* * Wake-up the bufspace daemon on transition below threshold. */ if (freebufs == bd->bd_lofreebuffers) bufspace_daemon_wakeup(bd); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) panic("getnewbuf_empty: Locked buf %p on free queue.", bp); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting. 
*/ static int buf_recycle(struct bufdomain *bd, bool kva) { struct bufqueue *bq; struct buf *bp, *nbp; if (kva) counter_u64_add(bufdefragcnt, 1); nbp = NULL; bq = bd->bd_cleanq; BQ_LOCK(bq); KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), ("buf_recycle: Locks don't match")); nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Implement a second chance algorithm for frequently * accessed buffers. */ if ((bp->b_flags & B_REUSE) != 0) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~B_REUSE; BUF_UNLOCK(bp); continue; } /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == QUEUE_CLEAN, ("buf_recycle: inconsistent queue %d bp %p", bp->b_qindex, bp)); KASSERT(bp->b_domain == BD_DOMAIN(bd), ("getnewbuf: queue domain %d doesn't match request %d", bp->b_domain, (int)BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bq_remove(bq, bp); BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); BQ_LOCK(bq); nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } bd->bd_wanted = 1; BQ_UNLOCK(bq); return (ENOBUFS); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct bufqueue *bq; bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } static void bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) { mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); TAILQ_INIT(&bq->bq_queue); bq->bq_len = 0; bq->bq_index = qindex; bq->bq_subqueue = subqueue; } static void bd_init(struct bufdomain *bd) { int i; bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1]; bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock"); bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock"); for (i = 0; i <= mp_maxid; i++) bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, "bufq clean subqueue lock"); mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); } /* * bq_remove: * * Removes a buffer from the free list, must be called with the * correct qlock held. 
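buf_recycle() above gives frequently used buffers a second chance: a buffer found with B_REUSE set is requeued at the tail with the flag cleared instead of being reclaimed. A standalone sketch of that scan, assuming <sys/queue.h> is available:

#include <sys/queue.h>
#include <stdio.h>

struct node {
	TAILQ_ENTRY(node) link;
	int	id;
	int	reuse;		/* analogue of B_REUSE */
};
TAILQ_HEAD(nodeq, node);

/* Reclaim the first node that does not get a second chance. */
static struct node *
recycle(struct nodeq *q)
{
	struct node *n;

	while ((n = TAILQ_FIRST(q)) != NULL) {
		if (n->reuse) {
			/* Recently referenced: clear the flag and requeue at the tail. */
			TAILQ_REMOVE(q, n, link);
			n->reuse = 0;
			TAILQ_INSERT_TAIL(q, n, link);
			continue;
		}
		TAILQ_REMOVE(q, n, link);
		return (n);
	}
	return (NULL);
}

int
main(void)
{
	struct nodeq q = TAILQ_HEAD_INITIALIZER(q);
	struct node nodes[3] = {{ .id = 0, .reuse = 1 }, { .id = 1 }, { .id = 2 }};

	for (int i = 0; i < 3; i++)
		TAILQ_INSERT_TAIL(&q, &nodes[i], link);
	printf("recycled node %d\n", recycle(&q)->id);	/* node 1; node 0 got a pass */
	return (0);
}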
*/ static void bq_remove(struct bufqueue *bq, struct buf *bp) { CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bq_remove: buffer %p not on a queue.", bp)); KASSERT(bufqueue(bp) == bq, ("bq_remove: Remove buffer %p from wrong queue.", bp)); BQ_ASSERT_LOCKED(bq); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } KASSERT(bq->bq_len >= 1, ("queue %d underflow", bp->b_qindex)); TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); bq->bq_len--; bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~(B_REMFREE | B_REUSE); } static void bd_flush(struct bufdomain *bd, struct bufqueue *bq) { struct buf *bp; BQ_ASSERT_LOCKED(bq); if (bq != bd->bd_cleanq) { BD_LOCK(bd); while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, b_freelist); bp->b_subqueue = bd->bd_cleanq->bq_subqueue; } bd->bd_cleanq->bq_len += bq->bq_len; bq->bq_len = 0; } if (bd->bd_wanted) { bd->bd_wanted = 0; wakeup(&bd->bd_wanted); } if (bq != bd->bd_cleanq) BD_UNLOCK(bd); } static int bd_flushall(struct bufdomain *bd) { struct bufqueue *bq; int flushed; int i; if (bd->bd_lim == 0) return (0); flushed = 0; for (i = 0; i <= mp_maxid; i++) { bq = &bd->bd_subq[i]; if (bq->bq_len == 0) continue; BQ_LOCK(bq); bd_flush(bd, bq); BQ_UNLOCK(bq); flushed++; } return (flushed); } static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) { struct bufdomain *bd; if (bp->b_qindex != QUEUE_NONE) panic("bq_insert: free buffer %p onto another queue?", bp); bd = bufdomain(bp); if (bp->b_flags & B_AGE) { /* Place this buf directly on the real queue. */ if (bq->bq_index == QUEUE_CLEAN) bq = bd->bd_cleanq; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); } else { BQ_LOCK(bq); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); } bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; bp->b_subqueue = bq->bq_subqueue; /* * Unlock before we notify so that we don't wakeup a waiter that * fails a trylock on the buf and sleeps again. */ if (unlock) BUF_UNLOCK(bp); if (bp->b_qindex == QUEUE_CLEAN) { /* * Flush the per-cpu queue and notify any waiters. */ if (bd->bd_wanted || (bq != bd->bd_cleanq && bq->bq_len >= bd->bd_lim)) bd_flush(bd, bq); } BQ_UNLOCK(bq); } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); counter_u64_add(bufkvaspace, -bp->b_kvasize); counter_u64_add(buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. 
*/ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; counter_u64_add(bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback that fires to avoid returning failure. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { bool done; int q; int i; done = false; for (i = 0; i < 5; i++) { for (q = 0; q < buf_domains; q++) if (buf_recycle(&bdomain[q], true) != 0) done = true; if (done) break; } return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ static void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; struct thread *td; int i; td = curthread; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) != 0) { brelse(rabp); continue; } #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ td->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { rabp->b_flags |= B_CKHASH; rabp->b_ckhashcalc = ckhashfunc; } rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. */ int breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; struct thread *td; int error, readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); td = curthread; /* * Can only return NULL if GB_LOCK_NOWAIT or GB_SPARSE flags * are specified. */ error = getblkx(vp, blkno, size, 0, 0, flags, &bp); if (error != 0) { *bpp = NULL; return (error); } flags &= ~GB_NOSPARSE; *bpp = bp; /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { bp->b_flags |= B_CKHASH; bp->b_ckhashcalc = ckhashfunc; } bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. 
*/ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) atomic_add_long(&barrierwrites, 1); oldflags = bp->b_flags; KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent other thread from seeing * empty dirty list and zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. */ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. 
In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(bp); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. 
* * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(bp); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (buf_dirty_count_severe()) { mtx_lock(&bdirtylock); while (buf_dirty_count_severe()) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty)); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { struct mount *v_mnt; int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. 
*/ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } + + if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && + (bp->b_flags & B_INVALONERR)) { + /* + * Forced invalidation of dirty buffer contents, to be used + * after a failed write in the rare case that the loss of the + * contents is acceptable. The buffer is invalidated and + * freed. + */ + bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; + bp->b_flags &= ~(B_ASYNC | B_CACHE); + } + if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are treated as being * transient. * * XXX Treating EIO as transient is not correct; the * contract with the local storage device drivers is that * they will only return EIO once the I/O is no longer * retriable. Network I/O also respects this through the * guarantees of TCP and/or the internal retries of NFS. * ENOMEM might be transient, but we also have no way of * knowing when its ok to retry/reschedule. In general, * this entire case should be made obsolete through better * error handling/recovery and resource scheduling. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(bp); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ v_mnt = bp->b_vp != NULL ? 
bp->b_vp->v_mount : NULL; if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 || vn_isdisk(bp->b_vp, NULL) || (bp->b_flags & B_DELWRI) == 0)) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and diassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); /* binsfree unlocks bp. */ binsfree(bp, qindex); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } buf_track(bp, __func__); /* binsfree unlocks bp. */ binsfree(bp, qindex); return; out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. 
*/ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp __unused; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(obj->paging_in_progress >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", obj->paging_in_progress, bp->b_npages)); vp = bp->b_vp; KASSERT(vp->v_holdcnt > 0, ("vfs_vmio_iodone: vnode %p has zero hold count", vp)); KASSERT(vp->v_object != NULL, ("vfs_vmio_iodone: vnode %p has no vm_object", vp)); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int flags, i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); while (vm_page_xbusied(m)) vm_page_sleep_if_xbusy(m, "mbncsh"); if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vm_page_release_locked(m, flags); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. 
*/ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int flags, i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); /* * The object lock is needed only if we will attempt to free pages. */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; if ((bp->b_flags & B_DIRECT) != 0) { flags |= VPR_TRYFREE; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); } else { obj = NULL; } for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; if (obj != NULL) vm_page_release_locked(m, flags); else vm_page_release(m, flags); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); if (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_WUNLOCK(obj); /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. 
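 *
 * The caller, vfs_bio_awrite() below, probes neighbouring logical
 * blocks and passes the disk address each one must have to be part of
 * a physically contiguous cluster, e.g. (taken from that loop):
 *
 *	vfs_bio_clcheck(vp, size, lblkno + i,
 *	    bp->b_blkno + ((i * size) >> DEV_BSHIFT));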
*/ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. 
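 *
 * That contract looks roughly like the following in the callers
 * (getblkx() and geteblk() below; illustrative sketch only):
 *
 *	bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
 *	if (bp != NULL) {
 *		allocbuf(bp, size);
 *		bufspace_release(bufdomain(bp), maxsize);
 *	}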
*/ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct bufdomain *bd; struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; if (vp == NULL) bd = &bdomain[0]; else bd = &bdomain[vp->v_bufobj.bo_domain]; counter_u64_add(getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(bd, maxsize, metadata) != 0) { counter_u64_add(getnewbufrestarts, 1); continue; } reserved = true; if ((bp = buf_alloc(bd)) == NULL) { counter_u64_add(getnewbufrestarts, 1); continue; } if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while (buf_recycle(bd, false) == 0); if (reserved) bufspace_release(bd, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, struct bufdomain *bd, int target) { int flushed; flushed = flushbufqueues(vp, bd, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, bd, target, 1); } return (flushed); } static void buf_daemon() { struct bufdomain *bd; int speedupreq; int lodirty; int i; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, SHUTDOWN_PRI_LAST + 100); /* * Start the buf clean daemons as children threads. */ for (i = 0 ; i < buf_domains; i++) { int error; error = kthread_add((void (*)(void *))bufspace_daemon, &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); if (error) panic("error %d spawning bufspace daemon", error); } /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kthread_suspend_check(); /* * Save speedupreq for this pass and reset to capture new * requests. */ speedupreq = bd_speedupreq; bd_speedupreq = 0; /* * Flush each domain sequentially according to its level and * the speedup request. */ for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; if (speedupreq) lodirty = bd->bd_numdirtybuffers / 2; else lodirty = bd->bd_lodirtybuffers; while (bd->bd_numdirtybuffers > lodirty) { if (buf_flush(NULL, bd, bd->bd_numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. 
*/ mtx_lock(&bdlock); if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target, int flushdeps) { struct bufqueue *bq; struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int error; bool unlock; flushed = 0; bq = &bd->bd_dirtyq; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); while (flushed != target) { maybe_yield(); BQ_LOCK(bq); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, b_freelist); } else { BQ_UNLOCK(bq); break; } /* * Skip sentinels inserted by other invocations of the * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { BQ_UNLOCK(bq); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); BQ_UNLOCK(bq); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 
0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); counter_u64_add(notbufdflushes, 1); } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp, 0); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } BQ_LOCK(bq); TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_RLOCK(bo); bp = gbincore(bo, blkno); BO_RUNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return 1; notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); vfs_drain_busy_pages(bp); vfs_setdirty_locked_object(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_ASSERT_WLOCKED(object); /* * We qualify the scan for modified pages on whether the * object has been flushed yet. 
*/ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } counter_u64_add(mappingrestarts, 1); bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. */ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; int error; error = getblkx(vp, blkno, size, slpflag, slptimeo, flags, &bp); if (error != 0) return (NULL); return (bp); } /* * getblkx: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. 
* * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ int getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags, struct buf **bpp) { struct buf *bp; struct bufobj *bo; daddr_t d_blkno; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; d_blkno = blkno; loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if ((flags & GB_LOCK_NOWAIT) != 0) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error != 0) return (error); /* If recursed, assume caller knows the rules. */ else if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. 
*/ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return (EEXIST); bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO notmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); if ((flags & GB_NOSPARSE) != 0 && vmio && !vn_isdisk(vp, NULL)) { error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); KASSERT(error != EOPNOTSUPP, ("GB_NOSPARSE from fs not supporting bmap, vp %p", vp)); if (error != 0) return (error); if (d_blkno == -1) return (EJUSTRETURN); } bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return (ETIMEDOUT); /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclaimation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclaimation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. 
*/ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; bufspace_release(bufdomain(bp), maxsize); brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_lblkno = blkno; bp->b_blkno = d_blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); *bpp = bp; return (0); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. * * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. 
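 *
 * The requested size is first rounded up to DEV_BSIZE granularity and,
 * for non-VMIO, non-malloced storage, to a full page.  For example
 * (illustrative numbers, assuming DEV_BSIZE of 512 and 4K pages):
 * growing such a buffer to size 3000 gives newbsize =
 * roundup2(3000, DEV_BSIZE) = 3072, which round_page() then turns into
 * 4096 before vfs_nonvmio_extend() is called, while b_bcount is still
 * set to the requested 3000.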
* * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; static struct bio_queue nondump_bios; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); /* * Avoid completing I/O when dumping after a panic since that may * result in a deadlock in the filesystem or pager code. Note that * this doesn't affect dumps that were started manually since we aim * to keep the system usable after it has been resumed. */ if (__predict_false(dumping && SCHEDULER_STOPPED())) { TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue); return; } if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. 
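 *
 * The usual synchronous I/O pattern, as used by bread() and bufwrite()
 * above, is to start the transfer with bstrategy() and then wait here
 * (illustrative sketch):
 *
 *	vfs_busy_pages(bp, 0);
 *	bp->b_iooffset = dbtob(bp->b_blkno);
 *	bstrategy(bp);
 *	error = bufwait(bp);
 *	if (error != 0)
 *		brelse(bp);
 *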
B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * bufdone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if ((bp->b_flags & B_CKHASH) != 0) { KASSERT(bp->b_iocmd == BIO_READ, ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd)); KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp)); (*bp->b_ckhashcalc)(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); if (dropobj) bufobj_wdrop(dropobj); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); VM_OBJECT_WUNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. 
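 *
 * A worked example (illustrative, assuming 4K pages): for a buffer with
 * b_offset = 6144 and b_bcount = 6144, the first call gets off = 6144,
 * computes eoff = 8192 and validates bytes 2048-4095 of the first page;
 * the second call gets off = 8192, computes eoff = 12288 (the end of
 * the buffer) and validates the whole second page.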
*/ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Ensure that all buffer pages are not exclusive busied. If any page is * exclusive busy, drain it. */ void vfs_drain_busy_pages(struct buf *bp) { vm_page_t m; int i, last_busied; VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object); last_busied = 0; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_xbusied(m)) { for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { vm_page_sleep_if_xbusy(m, "vbpage"); } } } for (i = 0; i < last_busied; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_WLOCK(obj); vfs_drain_busy_pages(bp); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_sbusy(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. 
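 *
 *	In short, the two call patterns look like this (a sketch of the
 *	callers' intent, not code taken from this change):
 *
 *		vfs_busy_pages(bp, 0);	read setup; valid pages may be
 *					swapped for bogus_page, and
 *					vfs_unbusy_pages() above restores
 *					the real pages afterwards
 *		vfs_busy_pages(bp, 1);	write setup; pages are
 *					write-protected and the covered
 *					range is marked valid and clean
 *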
XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_WUNLOCK(obj); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. 
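 *
 *	Typical use is on a freshly allocated filesystem block whose
 *	backing store holds no useful data yet (a sketch; "lbn" and "size"
 *	are assumed caller-supplied values and error handling is omitted):
 *
 *		bp = getblk(vp, lbn, size, 0, 0, 0);
 *		vfs_bio_clrbuf(bp);
 *		... fill in the new block through bp->b_data ...
 *		bdwrite(bp);
 *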
*/ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if ((bp->b_pages[0]->valid & mask) == 0) { pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & IO_EXT) != 0) bp->b_xflags |= BX_ALTDATA; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. 
*/ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | VM_ALLOC_WAITOK); pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; vm_page_unwire_noq(p); vm_page_free(p); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. */ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i __unused; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } /* * Initialize a struct bufobj before use. Memory is assumed zero filled. 
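 *
 *	For a vnode-backed bufobj the private pointer is the vnode itself,
 *	which bufstrategy() below asserts. A minimal sketch, assuming the
 *	bufobj is embedded in freshly zeroed vnode memory:
 *
 *		bufobj_init(&vp->v_bufobj, vp);
 *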
*/ void bufobj_init(struct bufobj *bo, void *private) { static volatile int bufobj_cleanq; bo->bo_domain = atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_private = private; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. */ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. 
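 *
 *	A filesystem typically wires this into its VOP_GETPAGES with two
 *	small callbacks. A sketch follows; the "foo_" names are
 *	hypothetical and a constant per-volume block size is assumed:
 *
 *		static daddr_t
 *		foo_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 *		{
 *			return (off / vp->v_mount->mnt_stat.f_iosize);
 *		}
 *
 *		static int
 *		foo_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 *		{
 *			return (vp->v_mount->mnt_stat.f_iosize);
 *		}
 *
 *		static int
 *		foo_getpages(struct vop_getpages_args *ap)
 *		{
 *			return (vfs_bio_getpages(ap->a_vp, ap->a_m,
 *			    ap->a_count, ap->a_rbehind, ap->a_rahead,
 *			    foo_gbp_getblkno, foo_gbp_getblksz));
 *		}
 *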
*/ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; error = 0; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); /* * Change the meaning of la from where the last requested page starts * to where it ends, because that's the end of the requested region * and the start of the potential read-ahead region. */ la += PAGE_SIZE; lpart = la > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; VM_OBJECT_WLOCK(object); again: for (i = 0; i < count; i++) vm_page_busy_downgrade(ma[i]); VM_OBJECT_WUNLOCK(object); lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). */ if (m->valid == VM_PAGE_BITS_ALL) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, enable to reduce buf * cache pressure. */ if (buf_pager_relbuf || m->valid != VM_PAGE_BITS_ALL) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || m->valid == VM_PAGE_BITS_ALL || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { VM_OBJECT_WLOCK(object); if (m->valid != 0 && m->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(m, TRUE); VM_OBJECT_WUNLOCK(object); } next_page:; } end_pages: VM_OBJECT_WLOCK(object); redo = false; for (i = 0; i < count; i++) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL); /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinguish, they could have been invalidated. 
* Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (ma[i]->valid != VM_PAGE_BITS_ALL) redo = true; } if (redo && error == 0) goto again; VM_OBJECT_WUNLOCK(object); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS); db_printf("b_vflags=0x%b b_ioflags0x%b\n", (u_int)bp->b_vflags, PRINT_BUF_VFLAGS, (u_int)bp->b_ioflags, PRINT_BIO_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p\n, b_blkno = %jd, b_lblkno = %jd, " "b_vp = %p, b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_vp, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ??? )"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } BUF_LOCKPRINTINFO(bp); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); } DB_SHOW_COMMAND(bufqueues, bufqueues) { struct bufdomain *bd; struct buf *bp; long total; int i, j, cnt; db_printf("bqempty: %d\n", bqempty.bq_len); for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; db_printf("Buf domain %d\n", i); db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); db_printf("\n"); db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); db_printf("\n"); db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); db_printf("\n"); total = 0; TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tcleanq count\t%d (%ld)\n", bd->bd_cleanq->bq_len, total); total = 0; TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tdirtyq count\t%d (%ld)\n", bd->bd_dirtyq.bq_len, total); db_printf("\twakeup\t\t%d\n", bd->bd_wanted); 
db_printf("\tlim\t\t%d\n", bd->bd_lim); db_printf("\tCPU "); for (j = 0; j <= mp_maxid; j++) db_printf("%d, ", bd->bd_subq[j].bq_len); db_printf("\n"); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) { cnt++; total += buf[j].b_bufsize; } db_printf("\tLocked buffers: %d space %ld\n", cnt, total); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) if (buf[j].b_domain == i) { cnt++; total += buf[j].b_bufsize; } db_printf("\tTotal buffers: %d space %ld\n", cnt, total); } } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 352232) +++ head/sys/sys/buf.h (revision 352233) @@ -1,584 +1,589 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 * $FreeBSD$ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include #include #include #include struct bio; struct buf; struct bufobj; struct mount; struct vnode; struct uio; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start)(struct buf *); void (*io_complete)(struct buf *); void (*io_deallocate)(struct buf *); int (*io_countdeps)(struct buf *, int); } bioops; struct vm_object; struct vm_page; typedef uint32_t b_xflags_t; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. * * All fields are protected by the buffer lock except those marked: * V - Protected by owning bufobj lock * Q - Protected by the buf queue lock * D - Protected by an dependency implementation specific lock */ struct buf { struct bufobj *b_bufobj; long b_bcount; void *b_caller1; caddr_t b_data; int b_error; uint16_t b_iocmd; /* BIO_* bio_cmd from bio.h */ uint16_t b_ioflags; /* BIO_* bio_flags from bio.h */ off_t b_iooffset; long b_resid; void (*b_iodone)(struct buf *); void (*b_ckhashcalc)(struct buf *); uint64_t b_ckhash; /* B_CKHASH requested check-hash */ daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file. */ TAILQ_ENTRY(buf) b_bobufs; /* (V) Buffer's associated vnode. */ uint32_t b_vflags; /* (V) BV_* flags */ uint8_t b_qindex; /* (Q) buffer queue index */ uint8_t b_domain; /* (Q) buf domain this resides in */ uint16_t b_subqueue; /* (Q) per-cpu q if any */ uint32_t b_flags; /* B_* flags. */ b_xflags_t b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ long b_bufsize; /* Allocated buffer size. */ int b_runningbufspace; /* when I/O is running, pipelining */ int b_kvasize; /* size of kva for buffer */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ caddr_t b_kvabase; /* base kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ struct vnode *b_vp; /* Device vnode. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. 
*/ union { TAILQ_ENTRY(buf) b_freelist; /* (Q) */ struct { void (*b_pgiodone)(void *, vm_page_t *, int, int); int b_pgbefore; int b_pgafter; }; }; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ void *b_fsprivate1; void *b_fsprivate2; void *b_fsprivate3; #if defined(FULL_BUF_TRACKING) #define BUF_TRACKING_SIZE 32 #define BUF_TRACKING_ENTRY(x) ((x) & (BUF_TRACKING_SIZE - 1)) const char *b_io_tracking[BUF_TRACKING_SIZE]; uint32_t b_io_tcnt; #elif defined(BUF_TRACKING) const char *b_io_tracking; #endif }; #define b_object b_bufobj->bo_object /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear BIO_ERROR and B_INVAL, * set BIO_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_CLUSTEROK This flag is typically set for B_DELWRI buffers * by filesystems that allow clustering when the buffer * is fully dirty and indicates that it may be clustered * with other adjacent dirty buffers. Note the clustering * may not be used with the stage 1 data write under NFS * but may be used for the commit rpc portion. * + * B_INVALONERR This flag is set on dirty buffers. It specifies that a + * write error should forcibly invalidate the buffer + * contents. This flag should be used with caution, as it + * discards data. It is incompatible with B_ASYNC. + * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * * B_DIRECT Hint that we should attempt to completely free * the pages underlying the buffer. B_DIRECT is * sticky until the buffer is released and typically * only has an effect when B_RELBUF is also set. * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */ #define B_DEFERRED 0x00000010 /* Skipped over for cleaning */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_VALIDSUSPWRT 0x00000040 /* Valid write during suspension. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_CKHASH 0x00000100 /* checksum hash calculated on read */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_NOREUSE 0x00000800 /* Contents not reused once released. 
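 *
 *	Illustrative use of B_INVALONERR (described above): a caller that
 *	would rather lose the dirty contents than retry a failing
 *	synchronous write might do the following (a sketch, not code taken
 *	from this change):
 *
 *		bp->b_flags |= B_INVALONERR;
 *		bp->b_flags &= ~B_ASYNC;
 *		error = bwrite(bp);
 *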
*/ #define B_REUSE 0x00001000 /* Contents reused, second chance. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceding first. */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ -#define B_00040000 0x00040000 /* Available flag. */ +#define B_INVALONERR 0x00040000 /* Invalidate on write error. */ #define B_00080000 0x00080000 /* Available flag. */ #define B_00100000 0x00100000 /* Available flag. */ #define B_00200000 0x00200000 /* Available flag. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_FS_FLAG1 0x00800000 /* Available flag for FS use. */ #define B_NOCOPY 0x01000000 /* Don't copy-on-write this buf. */ #define B_INFREECNT 0x02000000 /* buf is counted in numfreebufs */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_REMFREE 0x80000000 /* Delayed bremfree */ #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26b21\25b20" \ - "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ + "\24b19\23invalonerr\22clusterok\21malloc\20nocache\17b14\16inval" \ "\15reuse\14noreuse\13eintr\12done\11b8\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" /* * These flags are kept in b_xflags. * * BX_FSPRIV reserves a set of eight flags that may be used by individual * filesystems for their own purpose. Their specific definitions are * found in the header files for each filesystem that uses them. */ #define BX_VNDIRTY 0x00000001 /* On vnode dirty list */ #define BX_VNCLEAN 0x00000002 /* On vnode clean list */ #define BX_BKGRDWRITE 0x00000010 /* Do writes in background */ #define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */ #define BX_ALTDATA 0x00000040 /* Holds extended data */ #define BX_FSPRIV 0x00FF0000 /* filesystem-specific flags mask */ #define PRINT_BUF_XFLAGS "\20\7altdata\6bkgrdmarker\5bkgrdwrite\2clean\1dirty" #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ /* * These flags are kept in b_vflags. */ #define BV_SCANNED 0x00000001 /* VOP_FSYNC funcs mark written bufs */ #define BV_BKGRDINPROG 0x00000002 /* Background write in progress */ #define BV_BKGRDWAIT 0x00000004 /* Background write waiting */ #define BV_BKGRDERR 0x00000008 /* Error from background write */ #define PRINT_BUF_VFLAGS "\20\4bkgrderr\3bkgrdwait\2bkgrdinprog\1scanned" #ifdef _KERNEL #ifndef NSWBUF_MIN #define NSWBUF_MIN 16 #endif /* * Buffer locking */ extern const char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curthread */ #include /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, LK_NEW) /* * * Get a lock sleeping non-interruptably until it becomes available. */ #define BUF_LOCK(bp, locktype, interlock) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype), (interlock), \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE) /* * Get a lock sleeping with specified interruptably and timeout. 
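 *
 *	For example, getblk() takes the buffer lock while dropping the
 *	bufobj interlock roughly like this (an illustrative sketch, not a
 *	verbatim quote of that code):
 *
 *		error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 *		    LK_INTERLOCK, BO_LOCKPTR(bo), "getblk", slpflag,
 *		    slptimeo);
 *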
*/ #define BUF_TIMELOCK(bp, locktype, interlock, wmesg, catch, timo) \ _lockmgr_args_rw(&(bp)->b_lock, (locktype) | LK_TIMELOCK, \ (interlock), (wmesg), (PRIBIO + 4) | (catch), (timo), \ LOCK_FILE, LOCK_LINE) /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ #define BUF_UNLOCK(bp) do { \ KASSERT(((bp)->b_flags & B_REMFREE) == 0, \ ("BUF_UNLOCK %p while B_REMFREE is still set.", (bp))); \ \ (void)_lockmgr_args(&(bp)->b_lock, LK_RELEASE, NULL, \ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, \ LOCK_FILE, LOCK_LINE); \ } while (0) /* * Check if a buffer lock is recursed. */ #define BUF_LOCKRECURSED(bp) \ lockmgr_recursed(&(bp)->b_lock) /* * Check if a buffer lock is currently held. */ #define BUF_ISLOCKED(bp) \ lockstatus(&(bp)->b_lock) /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ lockdestroy(&(bp)->b_lock) /* * Print informations on a buffer lock. */ #define BUF_LOCKPRINTINFO(bp) \ lockmgr_printinfo(&(bp)->b_lock) /* * Buffer lock assertions. */ #if defined(INVARIANTS) && defined(INVARIANT_SUPPORT) #define BUF_ASSERT_LOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_LOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_SLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_SLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_XLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_XLOCKED, LOCK_FILE, LOCK_LINE) #define BUF_ASSERT_UNLOCKED(bp) \ _lockmgr_assert(&(bp)->b_lock, KA_UNLOCKED, LOCK_FILE, LOCK_LINE) #else #define BUF_ASSERT_LOCKED(bp) #define BUF_ASSERT_SLOCKED(bp) #define BUF_ASSERT_XLOCKED(bp) #define BUF_ASSERT_UNLOCKED(bp) #endif #ifdef _SYS_PROC_H_ /* Avoid #include pollution */ /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ #define BUF_KERNPROC(bp) \ _lockmgr_disown(&(bp)->b_lock, LOCK_FILE, LOCK_LINE) #endif #endif /* _KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. */ long bs_bufsize; /* Saved b_bufsize. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. 
*/ }; #ifdef _KERNEL static __inline int bwrite(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bwrite: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bwrite: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_write != NULL, ("bwrite: no bop_write bp=%p", bp)); return (BO_WRITE(bp->b_bufobj, bp)); } static __inline void bstrategy(struct buf *bp) { KASSERT(bp->b_bufobj != NULL, ("bstrategy: no bufobj bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops != NULL, ("bstrategy: no bo_ops bp=%p", bp)); KASSERT(bp->b_bufobj->bo_ops->bop_strategy != NULL, ("bstrategy: no bop_strategy bp=%p", bp)); BO_STRATEGY(bp->b_bufobj, bp); } static __inline void buf_start(struct buf *bp) { if (bioops.io_start) (*bioops.io_start)(bp); } static __inline void buf_complete(struct buf *bp) { if (bioops.io_complete) (*bioops.io_complete)(bp); } static __inline void buf_deallocate(struct buf *bp) { if (bioops.io_deallocate) (*bioops.io_deallocate)(bp); } static __inline int buf_countdeps(struct buf *bp, int i) { if (bioops.io_countdeps) return ((*bioops.io_countdeps)(bp, i)); else return (0); } static __inline void buf_track(struct buf *bp, const char *location) { #if defined(FULL_BUF_TRACKING) bp->b_io_tracking[BUF_TRACKING_ENTRY(bp->b_io_tcnt++)] = location; #elif defined(BUF_TRACKING) bp->b_io_tracking = location; #endif } #endif /* _KERNEL */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* * Flags for getblk's last parameter. */ #define GB_LOCK_NOWAIT 0x0001 /* Fail if we block on a buf lock. */ #define GB_NOCREAT 0x0002 /* Don't create a buf if not found. */ #define GB_NOWAIT_BD 0x0004 /* Do not wait for bufdaemon. */ #define GB_UNMAPPED 0x0008 /* Do not mmap buffer pages. */ #define GB_KVAALLOC 0x0010 /* But allocate KVA. */ #define GB_CKHASH 0x0020 /* If reading, calc checksum hash */ #define GB_NOSPARSE 0x0040 /* Do not instantiate holes */ #ifdef _KERNEL extern int nbuf; /* The number of buffer headers */ extern long maxswzone; /* Max KVA for swap structures */ extern long maxbcache; /* Max KVA for buffer cache */ extern int maxbcachebuf; /* Max buffer cache block size */ extern long runningbufspace; extern long hibufspace; extern int dirtybufthresh; extern int bdwriteskip; extern int dirtybufferflushes; extern int altbufferflushes; extern int nswbuf; /* Number of swap I/O buffer headers. */ extern caddr_t unmapped_buf; /* Data address for unmapped buffers. */ static inline int buf_mapped(struct buf *bp) { return (bp->b_data != unmapped_buf); } void runningbufwakeup(struct buf *); void waitrunningbufspace(void); caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est); void bufinit(void); void bufshutdown(int); void bdata2bio(struct buf *bp, struct bio *bip); void bwillwrite(void); int buf_dirty_count_severe(void); void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. 
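 *
 *	The bread()/brelse()/bdwrite() helpers declared below are commonly
 *	used in a read-modify-write cycle such as the following sketch
 *	("lbn" and "size" are assumed caller-supplied values):
 *
 *		error = bread(vp, lbn, size, NOCRED, &bp);
 *		if (error != 0) {
 *			brelse(bp);
 *			return (error);
 *		}
 *		... modify bp->b_data ...
 *		bdwrite(bp);		delayed write; or bwrite() for sync
 *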
*/ #define bread(vp, blkno, size, cred, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, 0, NULL, bpp) #define bread_gb(vp, blkno, size, cred, gbflags, bpp) \ breadn_flags(vp, blkno, size, NULL, NULL, 0, cred, \ gbflags, NULL, bpp) #define breadn(vp, blkno, size, rablkno, rabsize, cnt, cred, bpp) \ breadn_flags(vp, blkno, size, rablkno, rabsize, cnt, cred, \ 0, NULL, bpp) int breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *), struct buf **); void bdwrite(struct buf *); void bawrite(struct buf *); void babarrierwrite(struct buf *); int bbarrierwrite(struct buf *); void bdirty(struct buf *); void bundirty(struct buf *); void bufstrategy(struct bufobj *, struct buf *); void brelse(struct buf *); void bqrelse(struct buf *); int vfs_bio_awrite(struct buf *); void vfs_drain_busy_pages(struct buf *bp); struct buf *incore(struct bufobj *, daddr_t); struct buf *gbincore(struct bufobj *, daddr_t); struct buf *getblk(struct vnode *, daddr_t, int, int, int, int); int getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags, struct buf **bpp); struct buf *geteblk(int, int); int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); void bd_speedup(void); extern uma_zone_t pbuf_zone; uma_zone_t pbuf_zsecond_create(char *name, int max); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, int, struct buf **); int cluster_wbuild(struct vnode *, long, daddr_t, int, int); void cluster_write(struct vnode *, struct buf *, u_quad_t, int, int); void vfs_bio_brelse(struct buf *bp, int ioflags); void vfs_bio_bzero_buf(struct buf *bp, int base, int size); void vfs_bio_clrbuf(struct buf *); void vfs_bio_set_flags(struct buf *bp, int ioflags); void vfs_bio_set_valid(struct buf *, int base, int size); void vfs_busy_pages(struct buf *, int clear_modify); void vfs_unbusy_pages(struct buf *); int vmapbuf(struct buf *, int); void vunmapbuf(struct buf *); void brelvp(struct buf *); void bgetvp(struct vnode *, struct buf *); void pbgetbo(struct bufobj *bo, struct buf *bp); void pbgetvp(struct vnode *, struct buf *); void pbrelbo(struct buf *); void pbrelvp(struct buf *); int allocbuf(struct buf *bp, int size); void reassignbuf(struct buf *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t); typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t); int vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ Index: head/usr.sbin/makefs/msdos/msdosfs_denode.c =================================================================== --- head/usr.sbin/makefs/msdos/msdosfs_denode.c (revision 352232) +++ head/usr.sbin/makefs/msdos/msdosfs_denode.c (revision 352233) @@ -1,381 +1,382 @@ /* $NetBSD: msdosfs_denode.c,v 1.7 2015/03/29 05:52:59 agc Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include "ffs/buf.h" #include #include #include #include #include #include "makefs.h" #include "msdos.h" /* * If deget() succeeds it returns with the gotten denode locked(). * * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The pm_dev field and the address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * depp - returns the address of the gotten denode. */ int deget(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset, struct denode **depp) { int error; uint64_t inode; struct direntry *direntptr; struct denode *ldep; struct buf *bp; MSDOSFS_DPRINTF(("deget(pmp %p, dirclust %lu, diroffset %lx, depp %p)\n", pmp, dirclust, diroffset, depp)); /* * On FAT32 filesystems, root is a (more or less) normal * directory */ if (FAT32(pmp) && dirclust == MSDOSFSROOT) dirclust = pmp->pm_rootdirblk; inode = (uint64_t)pmp->pm_bpcluster * dirclust + diroffset; ldep = ecalloc(1, sizeof(*ldep)); ldep->de_vnode = NULL; ldep->de_flag = 0; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; ldep->de_inode = inode; ldep->de_pmp = pmp; ldep->de_refcnt = 1; fc_purge(ldep, 0); /* init the FAT cache for this denode */ /* * Copy the directory entry into the denode area of the vnode. 
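 *
 *	(Aside: a worked example of the synthetic inode number computed
 *	above is that, with 4 KiB clusters, the entry at dirclust 3 and
 *	diroffset 0x40 gets de_inode = 4096 * 3 + 0x40 = 0x3040. A typical
 *	lookup of the root denode is simply
 *
 *		error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &rdep);
 *
 *	where "rdep" is the caller's denode pointer.)
 *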
*/ if ((dirclust == MSDOSFSROOT || (FAT32(pmp) && dirclust == pmp->pm_rootdirblk)) && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. */ ldep->de_vnode = (struct vnode *)-1; ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_LowerCase = 0; if (FAT32(pmp)) ldep->de_StartCluster = pmp->pm_rootdirblk; /* de_FileSize will be filled in further down */ else { ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * DEV_BSIZE; } /* * fill in time and date so that dos2unixtime() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_CHun = 0; ldep->de_CTime = 0x0000; /* 00:00:00 */ ldep->de_CDate = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ ldep->de_ADate = ldep->de_CDate; ldep->de_MTime = ldep->de_CTime; ldep->de_MDate = ldep->de_CDate; /* leave the other fields as garbage */ } else { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) { ldep->de_Name[0] = SLOT_DELETED; *depp = NULL; return (error); } (void)DE_INTERNALIZE(ldep, direntptr); brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure. */ u_long size; /* * XXX it sometimes happens that the "." entry has cluster * number 0 when it shouldn't. Use the actual cluster number * instead of what is written in directory entry. */ if (diroffset == 0 && ldep->de_StartCluster != dirclust) { MSDOSFS_DPRINTF(("deget(): \".\" entry at clust %lu != %lu\n", dirclust, ldep->de_StartCluster)); ldep->de_StartCluster = dirclust; } if (ldep->de_StartCluster != MSDOSFSROOT) { error = pcbmap(ldep, 0xffff, 0, &size, 0); if (error == E2BIG) { ldep->de_FileSize = de_cn2off(pmp, size); error = 0; } else { MSDOSFS_DPRINTF(("deget(): pcbmap returned %d\n", error)); } } } *depp = ldep; return (0); } /* * Truncate the file described by dep to the length specified by length. */ int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred) { int error; int allerror; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; MSDOSFS_DPRINTF(("detrunc(): file %s, length %lu, flags %x\n", dep->de_Name, length, flags)); /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if (dep->de_vnode != NULL && !FAT32(pmp)) { MSDOSFS_DPRINTF(("detrunc(): can't truncate root directory, " "clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset)); return (EINVAL); } if (dep->de_FileSize < length) return deextend(dep, length, cred); /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. 
If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. */ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry, 0); if (error) { MSDOSFS_DPRINTF(("detrunc(): pcbmap fails %d\n", error)); return (error); } } fc_purge(dep, de_clcount(pmp, length)); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, &bp); if (error) { brelse(bp); MSDOSFS_DPRINTF(("detrunc(): bread fails %d\n", error)); return (error); } memset(bp->b_data + boff, 0, pmp->pm_bpcluster - boff); if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; if (!isadir) dep->de_flag |= DE_UPDATE|DE_MODIFIED; MSDOSFS_DPRINTF(("detrunc(): allerror %d, eofentry %lu\n", allerror, eofentry)); /* * If we need to break the cluster chain for the file then do it * now. */ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { MSDOSFS_DPRINTF(("detrunc(): fatentry errors %d\n", error)); return (error); } fc_setcache(dep, FC_LASTFC, de_cluster(pmp, length - 1), eofentry); } /* * Now free the clusters removed from the file because of the * truncation. */ if (chaintofree != 0 && !MSDOSFSEOF(pmp, chaintofree)) freeclusterchain(pmp, chaintofree); return (allerror); } /* * Extend the file described by dep to length specified by length. */ int deextend(struct denode *dep, u_long length, struct ucred *cred) { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; /* * The root of a DOS filesystem cannot be extended. */ if (dep->de_vnode != NULL && !FAT32(pmp)) return (EINVAL); /* * Directories cannot be extended. */ if (dep->de_Attributes & ATTR_DIRECTORY) return (EISDIR); if (length <= dep->de_FileSize) return (E2BIG); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return (ENOSPC); error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred); return (error); } } /* * Zero extend file range; ubc_zerorange() uses ubc_alloc() and a * memset(); we set the write size so ubc won't read in file data that * is zero'd later. */ dep->de_FileSize = length; dep->de_flag |= DE_UPDATE | DE_MODIFIED; return 0; } Index: head/usr.sbin/makefs/msdos/msdosfs_fat.c =================================================================== --- head/usr.sbin/makefs/msdos/msdosfs_fat.c (revision 352232) +++ head/usr.sbin/makefs/msdos/msdosfs_fat.c (revision 352233) @@ -1,1056 +1,1057 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_fat.c,v 1.28 1997/11/17 15:36:49 ws Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. 
* Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. 
* * October 1992 */ #include #include #include +#include #include #include #include #include "ffs/buf.h" #include #include "msdos/direntry.h" #include #include #include #include "makefs.h" #include "msdos.h" #define FULL_RUN ((u_int)0xffffffff) #define SYNCHRONOUS_WRITES(pmp) 1 static int chainalloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got); static int chainlength(struct msdosfsmount *pmp, u_long start, u_long count); static void fatblock(struct msdosfsmount *pmp, u_long ofs, u_long *bnp, u_long *sizep, u_long *bop); static int fatchain(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith); static void fc_lookup(struct denode *dep, u_long findcn, u_long *frcnp, u_long *fsrcnp); static void updatefats(struct msdosfsmount *pmp, struct buf *bp, u_long fatbn); static __inline void usemap_alloc(struct msdosfsmount *pmp, u_long cn); static __inline void usemap_free(struct msdosfsmount *pmp, u_long cn); static int clusteralloc1(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got); static void fatblock(struct msdosfsmount *pmp, u_long ofs, u_long *bnp, u_long *sizep, u_long *bop) { u_long bn, size; bn = ofs / pmp->pm_fatblocksize * pmp->pm_fatblocksec; size = MIN(pmp->pm_fatblocksec, pmp->pm_FATsecs - bn) * DEV_BSIZE; bn += pmp->pm_fatblk + pmp->pm_curfat * pmp->pm_FATsecs; if (bnp) *bnp = bn; if (sizep) *sizep = size; if (bop) *bop = ofs % pmp->pm_fatblocksize; } /* * Map the logical cluster number of a file into a physical disk sector * that is filesystem relative. * * dep - address of denode representing the file of interest * findcn - file relative cluster whose filesystem relative cluster number * and/or block number are/is to be found * bnp - address of where to place the filesystem relative block number. * If this pointer is null then don't return this quantity. * cnp - address of where to place the filesystem relative cluster number. * If this pointer is null then don't return this quantity. * sp - pointer to returned block size * * NOTE: Either bnp or cnp must be non-null. * This function has one side effect. If the requested file relative cluster * is beyond the end of file, then the actual number of clusters in the file * is returned in *cnp. This is useful for determining how long a directory is. * If cnp is null, nothing is returned. */ int pcbmap(struct denode *dep, u_long findcn, daddr_t *bnp, u_long *cnp, int *sp) { int error; u_long i; u_long cn; u_long prevcn = 0; /* XXX: prevcn could be used unititialized */ u_long byteoffset; u_long bn; u_long bo; struct buf *bp = NULL; u_long bp_bn = -1; struct msdosfsmount *pmp = dep->de_pmp; u_long bsize; assert(bnp != NULL || cnp != NULL || sp != NULL); cn = dep->de_StartCluster; /* * The "file" that makes up the root directory is contiguous, * permanently allocated, of fixed size, and is not made up of * clusters. If the cluster number is beyond the end of the root * directory, then return the number of clusters in the file. 
*/ if (cn == MSDOSFSROOT) { if (dep->de_Attributes & ATTR_DIRECTORY) { if (de_cn2off(pmp, findcn) >= dep->de_FileSize) { if (cnp) *cnp = de_bn2cn(pmp, pmp->pm_rootdirsize); return (E2BIG); } if (bnp) *bnp = pmp->pm_rootdirblk + de_cn2bn(pmp, findcn); if (cnp) *cnp = MSDOSFSROOT; if (sp) *sp = MIN(pmp->pm_bpcluster, dep->de_FileSize - de_cn2off(pmp, findcn)); return (0); } else { /* just an empty file */ if (cnp) *cnp = 0; return (E2BIG); } } /* * All other files do I/O in cluster sized blocks */ if (sp) *sp = pmp->pm_bpcluster; /* * Rummage around in the FAT cache, maybe we can avoid tromping * through every FAT entry for the file. And, keep track of how far * off the cache was from where we wanted to be. */ i = 0; fc_lookup(dep, findcn, &i, &cn); /* * Handle all other files or directories the normal way. */ for (; i < findcn; i++) { /* * Stop with all reserved clusters, not just with EOF. */ if ((cn | ~pmp->pm_fatmask) >= CLUST_RSRVD) goto hiteof; byteoffset = FATOFS(pmp, cn); fatblock(pmp, byteoffset, &bn, &bsize, &bo); if (bn != bp_bn) { if (bp) brelse(bp); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp_bn = bn; } prevcn = cn; if (bo >= bsize) { if (bp) brelse(bp); return (EIO); } if (FAT32(pmp)) cn = getulong(bp->b_data + bo); else cn = getushort(bp->b_data + bo); if (FAT12(pmp) && (prevcn & 1)) cn >>= 4; cn &= pmp->pm_fatmask; /* * Force the special cluster numbers * to be the same for all cluster sizes * to let the rest of msdosfs handle * all cases the same. */ if ((cn | ~pmp->pm_fatmask) >= CLUST_RSRVD) cn |= ~pmp->pm_fatmask; } if (!MSDOSFSEOF(pmp, cn)) { if (bp) brelse(bp); if (bnp) *bnp = cntobn(pmp, cn); if (cnp) *cnp = cn; fc_setcache(dep, FC_LASTMAP, i, cn); return (0); } hiteof:; if (cnp) *cnp = i; if (bp) brelse(bp); /* update last file cluster entry in the FAT cache */ fc_setcache(dep, FC_LASTFC, i - 1, prevcn); return (E2BIG); } /* * Find the closest entry in the FAT cache to the cluster we are looking * for. */ static void fc_lookup(struct denode *dep, u_long findcn, u_long *frcnp, u_long *fsrcnp) { int i; u_long cn; struct fatcache *closest = NULL; for (i = 0; i < FC_SIZE; i++) { cn = dep->de_fc[i].fc_frcn; if (cn != FCE_EMPTY && cn <= findcn) { if (closest == NULL || cn > closest->fc_frcn) closest = &dep->de_fc[i]; } } if (closest) { *frcnp = closest->fc_frcn; *fsrcnp = closest->fc_fsrcn; } } /* * Purge the FAT cache in denode dep of all entries relating to file * relative cluster frcn and beyond. */ void fc_purge(struct denode *dep, u_int frcn) { int i; struct fatcache *fcp; fcp = dep->de_fc; for (i = 0; i < FC_SIZE; i++, fcp++) { if (fcp->fc_frcn >= frcn) fcp->fc_frcn = FCE_EMPTY; } } /* * Update the FAT. * If mirroring the FAT, update all copies, with the first copy as last. * Else update only the current FAT (ignoring the others). * * pmp - msdosfsmount structure for filesystem to update * bp - addr of modified FAT block * fatbn - block number relative to begin of filesystem of the modified FAT block. */ static void updatefats(struct msdosfsmount *pmp, struct buf *bp, u_long fatbn) { struct buf *bpn; int cleanfat, i; #ifdef MSDOSFS_DEBUG printf("updatefats(pmp %p, bp %p, fatbn %lu)\n", pmp, bp, fatbn); #endif if (pmp->pm_flags & MSDOSFS_FATMIRROR) { /* * Now copy the block(s) of the modified FAT to the other copies of * the FAT and write them out. This is faster than reading in the * other FATs and then writing them back out. This could tie up * the FAT for quite a while. Preventing others from accessing it. 
* To prevent us from going after the FAT quite so much we use * delayed writes, unless they specified "synchronous" when the * filesystem was mounted. If synch is asked for then use * bwrite()'s and really slow things down. */ if (fatbn != pmp->pm_fatblk || FAT12(pmp)) cleanfat = 0; else if (FAT16(pmp)) cleanfat = 16; else cleanfat = 32; for (i = 1; i < pmp->pm_FATs; i++) { fatbn += pmp->pm_FATsecs; /* getblk() never fails */ bpn = getblk(pmp->pm_devvp, fatbn, bp->b_bcount, 0, 0, 0); memcpy(bpn->b_data, bp->b_data, bp->b_bcount); /* Force the clean bit on in the other copies. */ if (cleanfat == 16) ((uint8_t *)bpn->b_data)[3] |= 0x80; else if (cleanfat == 32) ((uint8_t *)bpn->b_data)[7] |= 0x08; if (SYNCHRONOUS_WRITES(pmp)) bwrite(bpn); else bdwrite(bpn); } } /* * Write out the first (or current) FAT last. */ if (SYNCHRONOUS_WRITES(pmp)) bwrite(bp); else bdwrite(bp); } /* * Updating entries in 12 bit FATs is a pain in the butt. * * The following picture shows where nibbles go when moving from a 12 bit * cluster number into the appropriate bytes in the FAT. * * byte m byte m+1 byte m+2 * +----+----+ +----+----+ +----+----+ * | 0 1 | | 2 3 | | 4 5 | FAT bytes * +----+----+ +----+----+ +----+----+ * * +----+----+----+ +----+----+----+ * | 3 0 1 | | 4 5 2 | * +----+----+----+ +----+----+----+ * cluster n cluster n+1 * * Where n is even. m = n + (n >> 2) * */ static __inline void usemap_alloc(struct msdosfsmount *pmp, u_long cn) { assert(cn <= pmp->pm_maxcluster); assert((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0); assert((pmp->pm_inusemap[cn / N_INUSEBITS] & (1 << (cn % N_INUSEBITS))) == 0); assert(pmp->pm_freeclustercount > 0); pmp->pm_inusemap[cn / N_INUSEBITS] |= 1U << (cn % N_INUSEBITS); pmp->pm_freeclustercount--; pmp->pm_flags |= MSDOSFS_FSIMOD; } static __inline void usemap_free(struct msdosfsmount *pmp, u_long cn) { assert(cn <= pmp->pm_maxcluster); assert((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0); assert((pmp->pm_inusemap[cn / N_INUSEBITS] & (1 << (cn % N_INUSEBITS))) != 0); pmp->pm_freeclustercount++; pmp->pm_flags |= MSDOSFS_FSIMOD; pmp->pm_inusemap[cn / N_INUSEBITS] &= ~(1U << (cn % N_INUSEBITS)); } int clusterfree(struct msdosfsmount *pmp, u_long cluster, u_long *oldcnp) { int error; u_long oldcn; error = fatentry(FAT_GET_AND_SET, pmp, cluster, &oldcn, MSDOSFSFREE); if (error) return (error); /* * If the cluster was successfully marked free, then update * the count of free clusters, and turn off the "allocated" * bit in the "in use" cluster bit map. */ usemap_free(pmp, cluster); if (oldcnp) *oldcnp = oldcn; return (0); } /* * Get or Set or 'Get and Set' the cluster'th entry in the FAT. * * function - whether to get or set a FAT entry * pmp - address of the msdosfsmount structure for the filesystem * whose FAT is to be manipulated. * cn - which cluster is of interest * oldcontents - address of a word that is to receive the contents of the * cluster'th entry if this is a get function * newcontents - the new value to be written into the cluster'th element of * the FAT if this is a set function. * * This function can also be used to free a cluster by setting the FAT entry * for a cluster to 0. * * All copies of the FAT are updated if this is a set function. NOTE: If * fatentry() marks a cluster as free it does not update the inusemap in * the msdosfsmount structure. This is left to the caller. 
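 *
 * As an illustration (a minimal standalone sketch, not part of this
 * change; the helper names are made up for the example), reading and
 * writing one 12-bit entry in a raw FAT image looks roughly like this.
 * Entry n lives at byte offset n + (n >> 1), i.e. n * 3 / 2, which is
 * presumably what FATOFS() yields once pm_fatmult = 3 and pm_fatdiv = 2
 * are set at mount time (note that for even n the byte index in the
 * diagram above works out to n + (n >> 1), not n + (n >> 2)).
 *
 *	#include <stdint.h>
 *
 *	static unsigned
 *	fat12_get(const uint8_t *fat, unsigned cn)
 *	{
 *		unsigned off = cn + cn / 2;
 *		unsigned w = fat[off] | (fat[off + 1] << 8);
 *
 *		return (cn & 1) ? (w >> 4) : (w & 0x0fff);
 *	}
 *
 *	static void
 *	fat12_set(uint8_t *fat, unsigned cn, unsigned val)
 *	{
 *		unsigned off = cn + cn / 2;
 *		unsigned w = fat[off] | (fat[off + 1] << 8);
 *
 *		if (cn & 1)
 *			w = (w & 0x000f) | ((val & 0x0fff) << 4);
 *		else
 *			w = (w & 0xf000) | (val & 0x0fff);
 *		fat[off] = w & 0xff;
 *		fat[off + 1] = w >> 8;
 *	}
 *
 * This mirrors the FAT12 branches of fatentry() below, minus the buffer
 * cache and mount structure around them.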
*/ int fatentry(int function, struct msdosfsmount *pmp, u_long cn, u_long *oldcontents, u_long newcontents) { int error; u_long readcn; u_long bn, bo, bsize, byteoffset; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("fatentry(func %d, pmp %p, clust %lu, oldcon %p, newcon %lx)\n", function, pmp, cn, oldcontents, newcontents); #endif #ifdef DIAGNOSTIC /* * Be sure they asked us to do something. */ if ((function & (FAT_SET | FAT_GET)) == 0) { #ifdef MSDOSFS_DEBUG printf("fatentry(): function code doesn't specify get or set\n"); #endif return (EINVAL); } /* * If they asked us to return a cluster number but didn't tell us * where to put it, give them an error. */ if ((function & FAT_GET) && oldcontents == NULL) { #ifdef MSDOSFS_DEBUG printf("fatentry(): get function with no place to put result\n"); #endif return (EINVAL); } #endif /* * Be sure the requested cluster is in the filesystem. */ if (cn < CLUST_FIRST || cn > pmp->pm_maxcluster) return (EINVAL); byteoffset = FATOFS(pmp, cn); fatblock(pmp, byteoffset, &bn, &bsize, &bo); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } if (function & FAT_GET) { if (FAT32(pmp)) readcn = getulong(bp->b_data + bo); else readcn = getushort(bp->b_data + bo); if (FAT12(pmp) & (cn & 1)) readcn >>= 4; readcn &= pmp->pm_fatmask; /* map reserved FAT entries to same values for all FATs */ if ((readcn | ~pmp->pm_fatmask) >= CLUST_RSRVD) readcn |= ~pmp->pm_fatmask; *oldcontents = readcn; } if (function & FAT_SET) { switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(bp->b_data + bo); if (cn & 1) { readcn &= 0x000f; readcn |= newcontents << 4; } else { readcn &= 0xf000; readcn |= newcontents & 0xfff; } putushort(bp->b_data + bo, readcn); break; case FAT16_MASK: putushort(bp->b_data + bo, newcontents); break; case FAT32_MASK: /* * According to spec we have to retain the * high order bits of the FAT entry. */ readcn = getulong(bp->b_data + bo); readcn &= ~FAT32_MASK; readcn |= newcontents & FAT32_MASK; putulong(bp->b_data + bo, readcn); break; } updatefats(pmp, bp, bn); bp = NULL; pmp->pm_fmod = 1; } if (bp) brelse(bp); return (0); } /* * Update a contiguous cluster chain * * pmp - mount point * start - first cluster of chain * count - number of clusters in chain * fillwith - what to write into FAT entry of last cluster */ static int fatchain(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith) { int error; u_long bn, bo, bsize, byteoffset, readcn, newc; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("fatchain(pmp %p, start %lu, count %lu, fillwith %lx)\n", pmp, start, count, fillwith); #endif /* * Be sure the clusters are in the filesystem. */ if (start < CLUST_FIRST || start + count - 1 > pmp->pm_maxcluster) return (EINVAL); while (count > 0) { byteoffset = FATOFS(pmp, start); fatblock(pmp, byteoffset, &bn, &bsize, &bo); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } while (count > 0) { start++; newc = --count > 0 ? 
start : fillwith; switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(bp->b_data + bo); if (start & 1) { readcn &= 0xf000; readcn |= newc & 0xfff; } else { readcn &= 0x000f; readcn |= newc << 4; } putushort(bp->b_data + bo, readcn); bo++; if (!(start & 1)) bo++; break; case FAT16_MASK: putushort(bp->b_data + bo, newc); bo += 2; break; case FAT32_MASK: readcn = getulong(bp->b_data + bo); readcn &= ~pmp->pm_fatmask; readcn |= newc & pmp->pm_fatmask; putulong(bp->b_data + bo, readcn); bo += 4; break; } if (bo >= bsize) break; } updatefats(pmp, bp, bn); } pmp->pm_fmod = 1; return (0); } /* * Check the length of a free cluster chain starting at start. * * pmp - mount point * start - start of chain * count - maximum interesting length */ static int chainlength(struct msdosfsmount *pmp, u_long start, u_long count) { u_long idx, max_idx; u_int map; u_long len; if (start > pmp->pm_maxcluster) return (0); max_idx = pmp->pm_maxcluster / N_INUSEBITS; idx = start / N_INUSEBITS; start %= N_INUSEBITS; map = pmp->pm_inusemap[idx]; map &= ~((1 << start) - 1); if (map) { len = ffs(map) - 1 - start; len = MIN(len, count); if (start + len > pmp->pm_maxcluster) len = pmp->pm_maxcluster - start + 1; return (len); } len = N_INUSEBITS - start; if (len >= count) { len = count; if (start + len > pmp->pm_maxcluster) len = pmp->pm_maxcluster - start + 1; return (len); } while (++idx <= max_idx) { if (len >= count) break; map = pmp->pm_inusemap[idx]; if (map) { len += ffs(map) - 1; break; } len += N_INUSEBITS; } len = MIN(len, count); if (start + len > pmp->pm_maxcluster) len = pmp->pm_maxcluster - start + 1; return (len); } /* * Allocate contigous free clusters. * * pmp - mount point. * start - start of cluster chain. * count - number of clusters to allocate. * fillwith - put this value into the FAT entry for the * last allocated cluster. * retcluster - put the first allocated cluster's number here. * got - how many clusters were actually allocated. */ static int chainalloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { int error; u_long cl, n; assert((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0); for (cl = start, n = count; n-- > 0;) usemap_alloc(pmp, cl++); pmp->pm_nxtfree = start + count; if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; pmp->pm_flags |= MSDOSFS_FSIMOD; error = fatchain(pmp, start, count, fillwith); if (error != 0) { for (cl = start, n = count; n-- > 0;) usemap_free(pmp, cl++); return (error); } #ifdef MSDOSFS_DEBUG printf("clusteralloc(): allocated cluster chain at %lu (%lu clusters)\n", start, count); #endif if (retcluster) *retcluster = start; if (got) *got = count; return (0); } /* * Allocate contiguous free clusters. * * pmp - mount point. * start - preferred start of cluster chain. * count - number of clusters requested. * fillwith - put this value into the FAT entry for the * last allocated cluster. * retcluster - put the first allocated cluster's number here. * got - how many clusters were actually allocated. 
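 *
 * For illustration, a rough standalone sketch (not part of this change;
 * the function name is made up) of the free-run scan that chainlength()
 * performs on pm_inusemap.  A set bit means "cluster in use", and the
 * sketch assumes 32-bit map words, which is what N_INUSEBITS normally
 * works out to; it walks bit by bit where the real code uses ffs() to
 * skip whole words at a time.
 *
 *	#include <stdint.h>
 *
 *	static unsigned long
 *	free_run_length(const uint32_t *map, unsigned long maxcluster,
 *	    unsigned long start, unsigned long want)
 *	{
 *		unsigned long cn, len = 0;
 *
 *		for (cn = start; cn <= maxcluster && len < want; cn++) {
 *			if (map[cn / 32] & (1U << (cn % 32)))
 *				break;	/* allocated cluster ends the run */
 *			len++;
 *		}
 *		return (len);
 *	}
 *
 * clusteralloc1() takes the first run of at least "count" clusters; if
 * none is long enough it falls back to a shorter one (the run at the
 * requested start if there was one, otherwise the longest run seen).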
*/ int clusteralloc(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { int error; error = clusteralloc1(pmp, start, count, fillwith, retcluster, got); return (error); } static int clusteralloc1(struct msdosfsmount *pmp, u_long start, u_long count, u_long fillwith, u_long *retcluster, u_long *got) { u_long idx; u_long len, newst, foundl, cn, l; u_long foundcn = 0; /* XXX: foundcn could be used unititialized */ u_int map; MSDOSFS_DPRINTF(("clusteralloc(): find %lu clusters\n", count)); if (start) { if ((len = chainlength(pmp, start, count)) >= count) return (chainalloc(pmp, start, count, fillwith, retcluster, got)); } else len = 0; newst = pmp->pm_nxtfree; foundl = 0; for (cn = newst; cn <= pmp->pm_maxcluster;) { idx = cn / N_INUSEBITS; map = pmp->pm_inusemap[idx]; map |= (1U << (cn % N_INUSEBITS)) - 1; if (map != FULL_RUN) { cn = idx * N_INUSEBITS + ffs(map ^ FULL_RUN) - 1; if ((l = chainlength(pmp, cn, count)) >= count) return (chainalloc(pmp, cn, count, fillwith, retcluster, got)); if (l > foundl) { foundcn = cn; foundl = l; } cn += l + 1; continue; } cn += N_INUSEBITS - cn % N_INUSEBITS; } for (cn = 0; cn < newst;) { idx = cn / N_INUSEBITS; map = pmp->pm_inusemap[idx]; map |= (1U << (cn % N_INUSEBITS)) - 1; if (map != FULL_RUN) { cn = idx * N_INUSEBITS + ffs(map ^ FULL_RUN) - 1; if ((l = chainlength(pmp, cn, count)) >= count) return (chainalloc(pmp, cn, count, fillwith, retcluster, got)); if (l > foundl) { foundcn = cn; foundl = l; } cn += l + 1; continue; } cn += N_INUSEBITS - cn % N_INUSEBITS; } if (!foundl) return (ENOSPC); if (len) return (chainalloc(pmp, start, len, fillwith, retcluster, got)); else return (chainalloc(pmp, foundcn, foundl, fillwith, retcluster, got)); } /* * Free a chain of clusters. * * pmp - address of the msdosfs mount structure for the filesystem * containing the cluster chain to be freed. * startcluster - number of the 1st cluster in the chain of clusters to be * freed. */ int freeclusterchain(struct msdosfsmount *pmp, u_long cluster) { int error; struct buf *bp = NULL; u_long bn, bo, bsize, byteoffset; u_long readcn, lbn = -1; while (cluster >= CLUST_FIRST && cluster <= pmp->pm_maxcluster) { byteoffset = FATOFS(pmp, cluster); fatblock(pmp, byteoffset, &bn, &bsize, &bo); if (lbn != bn) { if (bp) updatefats(pmp, bp, lbn); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } lbn = bn; } usemap_free(pmp, cluster); switch (pmp->pm_fatmask) { case FAT12_MASK: readcn = getushort(bp->b_data + bo); if (cluster & 1) { cluster = readcn >> 4; readcn &= 0x000f; readcn |= MSDOSFSFREE << 4; } else { cluster = readcn; readcn &= 0xf000; readcn |= MSDOSFSFREE & 0xfff; } putushort(bp->b_data + bo, readcn); break; case FAT16_MASK: cluster = getushort(bp->b_data + bo); putushort(bp->b_data + bo, MSDOSFSFREE); break; case FAT32_MASK: cluster = getulong(bp->b_data + bo); putulong(bp->b_data + bo, (MSDOSFSFREE & FAT32_MASK) | (cluster & ~FAT32_MASK)); break; } cluster &= pmp->pm_fatmask; if ((cluster | ~pmp->pm_fatmask) >= CLUST_RSRVD) cluster |= pmp->pm_fatmask; } if (bp) updatefats(pmp, bp, bn); return (0); } /* * Read in FAT blocks looking for free clusters. For every free cluster * found turn off its corresponding bit in the pm_inusemap. */ int fillinusemap(struct msdosfsmount *pmp) { struct buf *bp; u_long bn, bo, bsize, byteoffset, cn, readcn; int error; bp = NULL; /* * Mark all clusters in use, we mark the free ones in the FAT scan * loop further down. 
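 *
 * As a worked example for the scan below: entry 0 of the FAT (the
 * "FAT ID") must equal the BPB media descriptor with all higher bits
 * set, i.e. (pm_fatmask & 0xffffff00) | bpbMedia.  For the usual
 * fixed-disk media byte 0xf8 that is
 *
 *	FAT12:	0x00000f00 | 0xf8 = 0x0ff8
 *	FAT16:	0x0000ff00 | 0xf8 = 0xfff8
 *	FAT32:	0x0fffff00 | 0xf8 = 0x0ffffff8
 *
 * and anything else makes the scan fail with EINVAL.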
*/ for (cn = 0; cn < (pmp->pm_maxcluster + N_INUSEBITS) / N_INUSEBITS; cn++) pmp->pm_inusemap[cn] = FULL_RUN; /* * Figure how many free clusters are in the filesystem by ripping * through the FAT counting the number of entries whose content is * zero. These represent free clusters. */ pmp->pm_freeclustercount = 0; for (cn = 0; cn <= pmp->pm_maxcluster; cn++) { byteoffset = FATOFS(pmp, cn); bo = byteoffset % pmp->pm_fatblocksize; if (bo == 0) { /* Read new FAT block */ if (bp != NULL) brelse(bp); fatblock(pmp, byteoffset, &bn, &bsize, NULL); error = bread(pmp->pm_devvp, bn, bsize, NOCRED, &bp); if (error != 0) return (error); } if (FAT32(pmp)) readcn = getulong(bp->b_data + bo); else readcn = getushort(bp->b_data + bo); if (FAT12(pmp) && (cn & 1)) readcn >>= 4; readcn &= pmp->pm_fatmask; /* * Check if the FAT ID matches the BPB's media descriptor and * all other bits are set to 1. */ if (cn == 0 && readcn != ((pmp->pm_fatmask & 0xffffff00) | pmp->pm_bpb.bpbMedia)) { #ifdef MSDOSFS_DEBUG printf("mountmsdosfs(): Media descriptor in BPB" "does not match FAT ID\n"); #endif brelse(bp); return (EINVAL); } else if (readcn == CLUST_FREE) usemap_free(pmp, cn); } if (bp != NULL) brelse(bp); for (cn = pmp->pm_maxcluster + 1; cn < (pmp->pm_maxcluster + N_INUSEBITS) / N_INUSEBITS; cn++) pmp->pm_inusemap[cn / N_INUSEBITS] |= 1U << (cn % N_INUSEBITS); return (0); } /* * Allocate a new cluster and chain it onto the end of the file. * * dep - the file to extend * count - number of clusters to allocate * bpp - where to return the address of the buf header for the first new * file block * ncp - where to put cluster number of the first newly allocated cluster * If this pointer is 0, do not return the cluster number. * flags - see fat.h * * NOTE: This function is not responsible for turning on the DE_UPDATE bit of * the de_flag field of the denode and it does not change the de_FileSize * field. This is left for the caller to do. */ int extendfile(struct denode *dep, u_long count, struct buf **bpp, u_long *ncp, int flags) { int error; u_long frcn; u_long cn, got; struct msdosfsmount *pmp = dep->de_pmp; struct buf *bp; /* * Don't try to extend the root directory */ if (dep->de_StartCluster == MSDOSFSROOT && (dep->de_Attributes & ATTR_DIRECTORY)) { #ifdef MSDOSFS_DEBUG printf("extendfile(): attempt to extend root directory\n"); #endif return (ENOSPC); } /* * If the "file's last cluster" cache entry is empty, and the file * is not empty, then fill the cache entry by calling pcbmap(). */ if (dep->de_fc[FC_LASTFC].fc_frcn == FCE_EMPTY && dep->de_StartCluster != 0) { error = pcbmap(dep, 0xffff, 0, &cn, 0); /* we expect it to return E2BIG */ if (error != E2BIG) return (error); } dep->de_fc[FC_NEXTTOLASTFC].fc_frcn = dep->de_fc[FC_LASTFC].fc_frcn; dep->de_fc[FC_NEXTTOLASTFC].fc_fsrcn = dep->de_fc[FC_LASTFC].fc_fsrcn; while (count > 0) { /* * Allocate a new cluster chain and cat onto the end of the * file. If the file is empty we make de_StartCluster point * to the new block. Note that de_StartCluster being 0 is * sufficient to be sure the file is empty since we exclude * attempts to extend the root directory above, and the root * dir is the only file with a startcluster of 0 that has * blocks allocated (sort of). */ if (dep->de_StartCluster == 0) cn = 0; else cn = dep->de_fc[FC_LASTFC].fc_fsrcn + 1; error = clusteralloc(pmp, cn, count, CLUST_EOFE, &cn, &got); if (error) return (error); count -= got; /* * Give them the filesystem relative cluster number if they want * it. 
*/ if (ncp) { *ncp = cn; ncp = NULL; } if (dep->de_StartCluster == 0) { dep->de_StartCluster = cn; frcn = 0; } else { error = fatentry(FAT_SET, pmp, dep->de_fc[FC_LASTFC].fc_fsrcn, 0, cn); if (error) { clusterfree(pmp, cn, NULL); return (error); } frcn = dep->de_fc[FC_LASTFC].fc_frcn + 1; } /* * Update the "last cluster of the file" entry in the * denode's FAT cache. */ fc_setcache(dep, FC_LASTFC, frcn + got - 1, cn + got - 1); if ((flags & DE_CLEAR) && (dep->de_Attributes & ATTR_DIRECTORY)) { while (got-- > 0) { bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++), pmp->pm_bpcluster, 0, 0, 0); clrbuf(bp); if (bpp) { *bpp = bp; bpp = NULL; } else { bdwrite(bp); } } } } return (0); } Index: head/usr.sbin/makefs/msdos/msdosfs_lookup.c =================================================================== --- head/usr.sbin/makefs/msdos/msdosfs_lookup.c (revision 352232) +++ head/usr.sbin/makefs/msdos/msdosfs_lookup.c (revision 352233) @@ -1,301 +1,302 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_lookup.c,v 1.37 1997/11/17 15:36:54 ws Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. 
* * October 1992 */ #include #include +#include #include #include #include "ffs/buf.h" #include #include "msdos/direntry.h" #include #include #include #include "makefs.h" #include "msdos.h" /* * dep - directory entry to copy into the directory * ddep - directory to add to * depp - return the address of the denode for the created directory entry * if depp != 0 * cnp - componentname needed for Win95 long filenames */ int createde(struct denode *dep, struct denode *ddep, struct denode **depp, struct componentname *cnp) { int error; u_long dirclust, diroffset; struct direntry *ndep; struct msdosfsmount *pmp = ddep->de_pmp; struct buf *bp; daddr_t bn; int blsize; MSDOSFS_DPRINTF(("createde(dep %p, ddep %p, depp %p, cnp %p)\n", dep, ddep, depp, cnp)); /* * If no space left in the directory then allocate another cluster * and chain it onto the end of the file. There is one exception * to this. That is, if the root directory has no more space it * can NOT be expanded. extendfile() checks for and fails attempts * to extend the root directory. We just return an error in that * case. */ if (ddep->de_fndoffset >= ddep->de_FileSize) { diroffset = ddep->de_fndoffset + sizeof(struct direntry) - ddep->de_FileSize; dirclust = de_clcount(pmp, diroffset); error = extendfile(ddep, dirclust, 0, 0, DE_CLEAR); if (error) { (void)detrunc(ddep, ddep->de_FileSize, 0, NULL); return error; } /* * Update the size of the directory */ ddep->de_FileSize += de_cn2off(pmp, dirclust); } /* * We just read in the cluster with space. Copy the new directory * entry in. Then write it to disk. NOTE: DOS directories * do not get smaller as clusters are emptied. */ error = pcbmap(ddep, de_cluster(pmp, ddep->de_fndoffset), &bn, &dirclust, &blsize); if (error) return error; diroffset = ddep->de_fndoffset; if (dirclust != MSDOSFSROOT) diroffset &= pmp->pm_crbomask; if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp)) != 0) { brelse(bp); return error; } ndep = bptoep(pmp, bp, ddep->de_fndoffset); DE_EXTERNALIZE(ndep, dep); /* * Now write the Win95 long name */ if (ddep->de_fndcnt > 0) { uint8_t chksum = winChksum(ndep->deName); const u_char *un = (const u_char *)cnp->cn_nameptr; int unlen = cnp->cn_namelen; int cnt = 1; while (--ddep->de_fndcnt >= 0) { if (!(ddep->de_fndoffset & pmp->pm_crbomask)) { if ((error = bwrite(bp)) != 0) return error; ddep->de_fndoffset -= sizeof(struct direntry); error = pcbmap(ddep, de_cluster(pmp, ddep->de_fndoffset), &bn, 0, &blsize); if (error) return error; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return error; } ndep = bptoep(pmp, bp, ddep->de_fndoffset); } else { ndep--; ddep->de_fndoffset -= sizeof(struct direntry); } if (!unix2winfn(un, unlen, (struct winentry *)ndep, cnt++, chksum)) break; } } if ((error = bwrite(bp)) != 0) return error; /* * If they want us to return with the denode gotten. */ if (depp) { if (dep->de_Attributes & ATTR_DIRECTORY) { dirclust = dep->de_StartCluster; if (FAT32(pmp) && dirclust == pmp->pm_rootdirblk) dirclust = MSDOSFSROOT; if (dirclust == MSDOSFSROOT) diroffset = MSDOSFSROOT_OFS; else diroffset = 0; } return deget(pmp, dirclust, diroffset, depp); } return 0; } /* * Read in the disk block containing the directory entry (dirclu, dirofs) * and return the address of the buf header, and the address of the * directory entry within the block. 
*/ int readep(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset, struct buf **bpp, struct direntry **epp) { int error; daddr_t bn; int blsize; blsize = pmp->pm_bpcluster; if (dirclust == MSDOSFSROOT && de_blk(pmp, diroffset + blsize) > pmp->pm_rootdirsize) blsize = de_bn2off(pmp, pmp->pm_rootdirsize) & pmp->pm_crbomask; bn = detobn(pmp, dirclust, diroffset); if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, bpp)) != 0) { brelse(*bpp); *bpp = NULL; return (error); } if (epp) *epp = bptoep(pmp, *bpp, diroffset); return (0); } /* * Read in the disk block containing the directory entry dep came from and * return the address of the buf header, and the address of the directory * entry within the block. */ int readde(struct denode *dep, struct buf **bpp, struct direntry **epp) { return (readep(dep->de_pmp, dep->de_dirclust, dep->de_diroffset, bpp, epp)); } /* * Create a unique DOS name in dvp */ int uniqdosname(struct denode *dep, struct componentname *cnp, u_char *cp) { struct msdosfsmount *pmp = dep->de_pmp; struct direntry *dentp; int gen; int blsize; u_long cn; daddr_t bn; struct buf *bp; int error; if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) return (unix2dosfn((const u_char *)cnp->cn_nameptr, cp, cnp->cn_namelen, 0) ? 0 : EINVAL); for (gen = 1;; gen++) { /* * Generate DOS name with generation number */ if (!unix2dosfn((const u_char *)cnp->cn_nameptr, cp, cnp->cn_namelen, gen)) return gen == 1 ? EINVAL : EEXIST; /* * Now look for a dir entry with this exact name */ for (cn = error = 0; !error; cn++) { if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) { if (error == E2BIG) /* EOF reached and not found */ return 0; return error; } error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); if (error) { brelse(bp); return error; } for (dentp = (struct direntry *)bp->b_data; (char *)dentp < bp->b_data + blsize; dentp++) { if (dentp->deName[0] == SLOT_EMPTY) { /* * Last used entry and not found */ brelse(bp); return 0; } /* * Ignore volume labels and Win95 entries */ if (dentp->deAttributes & ATTR_VOLUME) continue; if (!bcmp(dentp->deName, cp, 11)) { error = EEXIST; break; } } brelse(bp); } } } Index: head/usr.sbin/makefs/msdos/msdosfs_vfsops.c =================================================================== --- head/usr.sbin/makefs/msdos/msdosfs_vfsops.c (revision 352232) +++ head/usr.sbin/makefs/msdos/msdosfs_vfsops.c (revision 352233) @@ -1,391 +1,392 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include /* $NetBSD: msdosfs_vfsops.c,v 1.10 2016/01/30 09:59:27 mlelstv Exp $ */ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include "ffs/buf.h" #include #include #include "msdos/direntry.h" #include #include #include #include #include "makefs.h" #include "msdos.h" struct msdosfsmount * msdosfs_mount(struct vnode *devvp) { struct msdosfsmount *pmp = NULL; struct buf *bp; union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; uint8_t SecPerClust; int ronly = 0, error; int bsize; unsigned secsize = 512; MSDOSFS_DPRINTF(("%s(bread 0)\n", __func__)); if ((error = bread(devvp, 0, secsize, 0, &bp)) != 0) goto error_exit; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB; if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { MSDOSFS_DPRINTF(("bootsig0 %d bootsig1 %d\n", bsp->bs50.bsBootSectSig0, bsp->bs50.bsBootSectSig1)); error = EINVAL; goto error_exit; } bsize = 0; pmp = ecalloc(1, sizeof(*pmp)); /* * Compute several useful quantities from the bpb in the * bootsector. Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. 
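 *
 * The BPB fields read below are stored little-endian on disk; that is
 * all the getushort()/getulong() helpers do.  A minimal standalone
 * equivalent, for illustration only (le16/le32 are made-up names, not
 * the project's actual macros):
 *
 *	#include <stdint.h>
 *
 *	static uint16_t
 *	le16(const uint8_t *p)
 *	{
 *		return (uint16_t)(p[0] | (p[1] << 8));
 *	}
 *
 *	static uint32_t
 *	le32(const uint8_t *p)
 *	{
 *		return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
 *		    ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
 *	}
 *
 * so, for example, pm_BytesPerSec below is le16(b50->bpbBytesPerSec).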
*/ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; MSDOSFS_DPRINTF(("%s(BytesPerSec=%u, ResSectors=%u, FATs=%d, " "RootDirEnts=%u, Sectors=%u, FATsecs=%lu, SecPerTrack=%u, " "Heads=%u, Media=%u)\n", __func__, pmp->pm_BytesPerSec, pmp->pm_ResSectors, pmp->pm_FATs, pmp->pm_RootDirEnts, pmp->pm_Sectors, pmp->pm_FATsecs, pmp->pm_SecPerTrack, pmp->pm_Heads, pmp->pm_Media)); /* XXX - We should probably check more values here */ if (!pmp->pm_BytesPerSec || !SecPerClust || pmp->pm_SecPerTrack > 63) { MSDOSFS_DPRINTF(("bytespersec %d secperclust %d " "secpertrack %d\n", pmp->pm_BytesPerSec, SecPerClust, pmp->pm_SecPerTrack)); error = EINVAL; goto error_exit; } if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } pmp->pm_flags = 0; if (pmp->pm_RootDirEnts == 0) { unsigned short vers = getushort(b710->bpbFSVers); /* * Some say that bsBootSectSig[23] must be zero, but * Windows does not require this and some digital cameras * do not set these to zero. Therefore, do not insist. */ if (pmp->pm_Sectors || pmp->pm_FATsecs || vers) { MSDOSFS_DPRINTF(("sectors %d fatsecs %lu vers %d\n", pmp->pm_Sectors, pmp->pm_FATsecs, vers)); error = EINVAL; goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); /* mirrorring is enabled if the FATMIRROR bit is not set */ if ((getushort(b710->bpbExtFlags) & FATMIRROR) == 0) pmp->pm_flags |= MSDOSFS_FATMIRROR; else pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; /* Check that fs has nonzero FAT size */ if (pmp->pm_FATsecs == 0) { MSDOSFS_DPRINTF(("FATsecs is 0\n")); error = EINVAL; goto error_exit; } pmp->pm_fatblk = pmp->pm_ResSectors; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo); } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = (pmp->pm_RootDirEnts * sizeof(struct direntry) + pmp->pm_BytesPerSec - 1) / pmp->pm_BytesPerSec;/* in sectors */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_maxcluster = ((pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust) + 1; pmp->pm_fatsize = pmp->pm_FATsecs * pmp->pm_BytesPerSec; if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one FAT entry will not be split across * multiple blocks. 
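 *
 * For reference, the threshold in the test just above works out to 4084
 * ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK), so a volume with at most
 * 4084 clusters gets 12-bit FAT entries.  The pm_fatmult/pm_fatdiv pair
 * set below encodes the entry width as a byte ratio, so the byte offset
 * of entry cn within the FAT works out to cn * pm_fatmult / pm_fatdiv,
 * e.g. for cluster 1000:
 *
 *	FAT12: 3/2 -> byte 1500
 *	FAT16: 2/1 -> byte 2000
 *	FAT32: 4/1 -> byte 4000
 *
 * The FAT12 block size of 3 sectors chosen further down is what keeps
 * an entry from being split: 3 * pm_BytesPerSec bytes is always a whole
 * number of 1.5-byte entries (1536 bytes, or 1024 entries, with 512-byte
 * sectors).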
*/ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * pmp->pm_BytesPerSec; else pmp->pm_fatblocksize = MAXBSIZE; pmp->pm_fatblocksec = pmp->pm_fatblocksize / pmp->pm_BytesPerSec; pmp->pm_bnshift = ffs(pmp->pm_BytesPerSec) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * pmp->pm_BytesPerSec; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; MSDOSFS_DPRINTF(("%s(fatmask=%lu, fatmult=%u, fatdiv=%u, " "fatblocksize=%lu, fatblocksec=%lu, bnshift=%lu, pbcluster=%lu, " "crbomask=%lu, cnshift=%lu)\n", __func__, (unsigned long)pmp->pm_fatmask, pmp->pm_fatmult, pmp->pm_fatdiv, pmp->pm_fatblocksize, pmp->pm_fatblocksec, pmp->pm_bnshift, pmp->pm_bpcluster, pmp->pm_crbomask, pmp->pm_cnshift)); /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { MSDOSFS_DPRINTF(("bpcluster %lu cnshift %lu\n", pmp->pm_bpcluster, pmp->pm_cnshift)); error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check FSInfo. */ if (pmp->pm_fsinfo) { struct fsinfo *fp; /* * XXX If the fsinfo block is stored on media with * 2KB or larger sectors, is the fsinfo structure * padded at the end or in the middle? */ if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, 0, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!memcmp(fp->fsisig1, "RRaA", 4) && !memcmp(fp->fsisig2, "rrAa", 4) && !memcmp(fp->fsisig3, "\0\0\125\252", 4)) pmp->pm_nxtfree = getulong(fp->fsinxtfree); else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Check and validate (or perhaps invalidate?) the fsinfo structure? * XXX */ if (pmp->pm_fsinfo) { if ((pmp->pm_nxtfree == 0xffffffffUL) || (pmp->pm_nxtfree > pmp->pm_maxcluster)) pmp->pm_fsinfo = 0; } /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = ecalloc(sizeof(*pmp->pm_inusemap), ((pmp->pm_maxcluster + N_INUSEBITS) / N_INUSEBITS)); /* * fillinusemap() needs pm_devvp. */ pmp->pm_dev = 0; pmp->pm_devvp = devvp; /* * Have the inuse map filled in. */ if ((error = fillinusemap(pmp)) != 0) { MSDOSFS_DPRINTF(("fillinusemap %d\n", error)); goto error_exit; } /* * Finish up. */ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else pmp->pm_fmod = 1; /* * If we ever do quotas for DOS filesystems this would be a place * to fill in the info in the msdosfsmount structure. You dolt, * quotas on dos filesystems make no sense because files have no * owners on dos filesystems. of course there is some empty space * in the directory entry where we could put uid's and gid's. */ return pmp; error_exit: if (bp) brelse(bp); if (pmp) { if (pmp->pm_inusemap) free(pmp->pm_inusemap); free(pmp); } errno = error; return NULL; } int msdosfs_root(struct msdosfsmount *pmp, struct vnode *vp) { struct denode *ndep; int error; *vp = *pmp->pm_devvp; if ((error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep)) != 0) { errno = error; return -1; } vp->v_data = ndep; return 0; } /* * If we have an FSInfo block, update it. 
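 *
 * For context, a summary of the standard FAT32 FSInfo sector layout
 * (byte offsets within the 512-byte sector, quoted from the FAT
 * specification rather than from this diff, so they should be checked
 * against struct fsinfo):
 *
 *	  0	fsisig1		"RRaA"
 *	484	fsisig2		"rrAa"
 *	488	fsinfree	free cluster count, little-endian 32-bit
 *	492	fsinxtfree	next-free-cluster hint, little-endian 32-bit
 *	508	fsisig3		00 00 55 aa
 *
 * Only fsinfree and fsinxtfree are rewritten below; the signatures were
 * already validated when the volume was set up above.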
*/ int msdosfs_fsiflush(struct msdosfsmount *pmp) { struct fsinfo *fp; struct buf *bp; int error; if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) { error = 0; goto out; } error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp); if (error != 0) { brelse(bp); goto out; } fp = (struct fsinfo *)bp->b_data; putulong(fp->fsinfree, pmp->pm_freeclustercount); putulong(fp->fsinxtfree, pmp->pm_nxtfree); pmp->pm_flags &= ~MSDOSFS_FSIMOD; error = bwrite(bp); out: return (error); } Index: head/usr.sbin/makefs/msdos/msdosfs_vnops.c =================================================================== --- head/usr.sbin/makefs/msdos/msdosfs_vnops.c (revision 352232) +++ head/usr.sbin/makefs/msdos/msdosfs_vnops.c (revision 352233) @@ -1,640 +1,641 @@ /* $NetBSD: msdosfs_vnops.c,v 1.19 2017/04/13 17:10:12 christos Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include #include #include "ffs/buf.h" #include #include "msdos/direntry.h" #include #include #include #include "makefs.h" #include "msdos.h" /* * Some general notes: * * In the ufs filesystem the inodes, superblocks, and indirect blocks are * read/written using the vnode for the filesystem. 
Blocks that represent * the contents of a file are read/written using the vnode for the file * (including directories when they are read/written as files). This * presents problems for the dos filesystem because data that should be in * an inode (if dos had them) resides in the directory itself. Since we * must update directory entries without the benefit of having the vnode * for the directory we must use the vnode for the filesystem. This means * that when a directory is actually read/written (via read, write, or * readdir, or seek) we must use the vnode for the filesystem instead of * the vnode for the directory as would happen in ufs. This is to insure we * retrieve the correct block from the buffer cache since the hash value is * based upon the vnode address and the desired block number. */ static int msdosfs_wfile(const char *, struct denode *, fsnode *); static void unix2fattime(const struct timespec *tsp, uint16_t *ddp, uint16_t *dtp); static void msdosfs_times(struct denode *dep, const struct stat *st) { if (stampst.st_ino) st = &stampst; unix2fattime(&st->st_birthtim, &dep->de_CDate, &dep->de_CTime); unix2fattime(&st->st_atim, &dep->de_ADate, NULL); unix2fattime(&st->st_mtim, &dep->de_MDate, &dep->de_MTime); } static void unix2fattime(const struct timespec *tsp, uint16_t *ddp, uint16_t *dtp) { time_t t1; struct tm lt = {0}; t1 = tsp->tv_sec; localtime_r(&t1, <); unsigned long fat_time = ((lt.tm_year - 80) << 25) | ((lt.tm_mon + 1) << 21) | (lt.tm_mday << 16) | (lt.tm_hour << 11) | (lt.tm_min << 5) | (lt.tm_sec >> 1); if (ddp != NULL) *ddp = (uint16_t)(fat_time >> 16); if (dtp != NULL) *dtp = (uint16_t)fat_time; } /* * When we search a directory the blocks containing directory entries are * read and examined. The directory entries contain information that would * normally be in the inode of a unix filesystem. This means that some of * a directory's contents may also be in memory resident denodes (sort of * an inode). This can cause problems if we are searching while some other * process is modifying a directory. To prevent one process from accessing * incompletely modified directory information we depend upon being the * sole owner of a directory block. bread/brelse provide this service. * This being the case, when a process modifies a directory it must first * acquire the disk block that contains the directory entry to be modified. * Then update the disk block and the denode, and then write the disk block * out to disk. This way disk blocks containing directory entries and in * memory denode's will be in synch. */ static int msdosfs_findslot(struct denode *dp, struct componentname *cnp) { daddr_t bn; int error; int slotcount; int slotoffset = 0; int frcn; u_long cluster; int blkoff; u_int diroff; int blsize; struct msdosfsmount *pmp; struct buf *bp = 0; struct direntry *dep; u_char dosfilename[12]; int wincnt = 1; int chksum = -1, chksum_ok; int olddos = 1; pmp = dp->de_pmp; switch (unix2dosfn((const u_char *)cnp->cn_nameptr, dosfilename, cnp->cn_namelen, 0)) { case 0: return (EINVAL); case 1: break; case 2: wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr, cnp->cn_namelen) + 1; break; case 3: olddos = 0; wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr, cnp->cn_namelen) + 1; break; } if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) wincnt = 1; /* * Suppress search for slots unless creating * file and at end of pathname, in which case * we watch for a place to put the new file in * case it doesn't already exist. 
*/ slotcount = 0; MSDOSFS_DPRINTF(("%s(): dos filename: %s\n", __func__, dosfilename)); /* * Search the directory pointed at by vdp for the name pointed at * by cnp->cn_nameptr. */ /* * The outer loop ranges over the clusters that make up the * directory. Note that the root directory is different from all * other directories. It has a fixed number of blocks that are not * part of the pool of allocatable clusters. So, we treat it a * little differently. The root directory starts at "cluster" 0. */ diroff = 0; for (frcn = 0; diroff < dp->de_FileSize; frcn++) { if ((error = pcbmap(dp, frcn, &bn, &cluster, &blsize)) != 0) { if (error == E2BIG) break; return (error); } error = bread(pmp->pm_devvp, bn, blsize, 0, &bp); if (error) { return (error); } for (blkoff = 0; blkoff < blsize; blkoff += sizeof(struct direntry), diroff += sizeof(struct direntry)) { dep = (struct direntry *)(bp->b_data + blkoff); /* * If the slot is empty and we are still looking * for an empty then remember this one. If the * slot is not empty then check to see if it * matches what we are looking for. If the slot * has never been filled with anything, then the * remainder of the directory has never been used, * so there is no point in searching it. */ if (dep->deName[0] == SLOT_EMPTY || dep->deName[0] == SLOT_DELETED) { /* * Drop memory of previous long matches */ chksum = -1; if (slotcount < wincnt) { slotcount++; slotoffset = diroff; } if (dep->deName[0] == SLOT_EMPTY) { brelse(bp); goto notfound; } } else { /* * If there wasn't enough space for our * winentries, forget about the empty space */ if (slotcount < wincnt) slotcount = 0; /* * Check for Win95 long filename entry */ if (dep->deAttributes == ATTR_WIN95) { if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) continue; chksum = winChkName( (const u_char *)cnp->cn_nameptr, cnp->cn_namelen, (struct winentry *)dep, chksum); continue; } /* * Ignore volume labels (anywhere, not just * the root directory). */ if (dep->deAttributes & ATTR_VOLUME) { chksum = -1; continue; } /* * Check for a checksum or name match */ chksum_ok = (chksum == winChksum(dep->deName)); if (!chksum_ok && (!olddos || memcmp(dosfilename, dep->deName, 11))) { chksum = -1; continue; } MSDOSFS_DPRINTF(("%s(): match blkoff %d, diroff %u\n", __func__, blkoff, diroff)); /* * Remember where this directory * entry came from for whoever did * this lookup. */ dp->de_fndoffset = diroff; dp->de_fndcnt = 0; return EEXIST; } } /* for (blkoff = 0; .... */ /* * Release the buffer holding the directory cluster just * searched. */ brelse(bp); } /* for (frcn = 0; ; frcn++) */ notfound: /* * We hold no disk buffers at this point. */ /* * If we get here we didn't find the entry we were looking for. But * that's ok if we are creating or renaming and are at the end of * the pathname and the directory hasn't been removed. */ MSDOSFS_DPRINTF(("%s(): refcnt %ld, slotcount %d, slotoffset %d\n", __func__, dp->de_refcnt, slotcount, slotoffset)); /* * Fixup the slot description to point to the place where * we might put the new DOS direntry (putting the Win95 * long name entries before that) */ if (!slotcount) { slotcount = 1; slotoffset = diroff; } if (wincnt > slotcount) { slotoffset += sizeof(struct direntry) * (wincnt - slotcount); } /* * Return an indication of where the new directory * entry should be put. */ dp->de_fndoffset = slotoffset; dp->de_fndcnt = wincnt - 1; /* * We return with the directory locked, so that * the parameters we set up above will still be * valid if we actually decide to do a direnter(). 
* We return ni_vp == NULL to indicate that the entry * does not currently exist; we leave a pointer to * the (locked) directory inode in ndp->ni_dvp. * * NB - if the directory is unlocked, then this * information cannot be used. */ return 0; } /* * Create a regular file. On entry the directory to contain the file being * created is locked. We must release before we return. */ struct denode * msdosfs_mkfile(const char *path, struct denode *pdep, fsnode *node) { struct componentname cn; struct denode ndirent; struct denode *dep; int error; struct stat *st = &node->inode->st; cn.cn_nameptr = node->name; cn.cn_namelen = strlen(node->name); MSDOSFS_DPRINTF(("%s(name %s, mode 0%o size %zu)\n", __func__, node->name, st->st_mode, (size_t)st->st_size)); /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad; } /* * Create a directory entry for the file, then call createde() to * have it installed. NOTE: DOS files are always executable. We * use the absence of the owner write bit to make the file * readonly. */ memset(&ndirent, 0, sizeof(ndirent)); if ((error = uniqdosname(pdep, &cn, ndirent.de_Name)) != 0) goto bad; ndirent.de_Attributes = (st->st_mode & S_IWUSR) ? ATTR_ARCHIVE : ATTR_ARCHIVE | ATTR_READONLY; ndirent.de_StartCluster = 0; ndirent.de_FileSize = 0; ndirent.de_pmp = pdep->de_pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; msdosfs_times(&ndirent, &node->inode->st); if ((error = msdosfs_findslot(pdep, &cn)) != 0) goto bad; if ((error = createde(&ndirent, pdep, &dep, &cn)) != 0) goto bad; if ((error = msdosfs_wfile(path, dep, node)) != 0) goto bad; return dep; bad: errno = error; return NULL; } static int msdosfs_updatede(struct denode *dep) { struct buf *bp; struct direntry *dirp; int error; dep->de_flag &= ~DE_MODIFIED; error = readde(dep, &bp, &dirp); if (error) return error; DE_EXTERNALIZE(dirp, dep); error = bwrite(bp); return error; } /* * Write data to a file or directory. 
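 *
 * The copy loop below goes one cluster at a time: the offset within the
 * current cluster is isolated with the mount's mask (on = offs &
 * pm_crbomask), and the amount copied per iteration is capped both by
 * what is left of the file and by what is left of the cluster.  As a
 * small made-up example with 4 KiB clusters (pm_crbomask = 0x0fff) and
 * a 10000-byte source file:
 *
 *	offs = 0	on = 0	copy 4096 bytes into file cluster 0
 *	offs = 4096	on = 0	copy 4096 bytes into file cluster 1
 *	offs = 8192	on = 0	copy 1808 bytes into file cluster 2
 *
 * With these numbers "on" stays zero because every iteration starts
 * cluster-aligned; the MIN(nsize - offs, blsize - on) term is what
 * would trim the copy if it did not.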
*/ static int msdosfs_wfile(const char *path, struct denode *dep, fsnode *node) { int error, fd; size_t osize = dep->de_FileSize; struct stat *st = &node->inode->st; size_t nsize, offs; struct msdosfsmount *pmp = dep->de_pmp; struct buf *bp; char *dat; u_long cn = 0; error = 0; /* XXX: gcc/vax */ MSDOSFS_DPRINTF(("%s(diroff %lu, dirclust %lu, startcluster %lu)\n", __func__, dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster)); if (st->st_size == 0) return 0; /* Don't bother to try to write files larger than the fs limit */ if (st->st_size > MSDOSFS_FILESIZE_MAX) return EFBIG; nsize = st->st_size; MSDOSFS_DPRINTF(("%s(nsize=%zu, osize=%zu)\n", __func__, nsize, osize)); if (nsize > osize) { if ((error = deextend(dep, nsize, NULL)) != 0) return error; if ((error = msdosfs_updatede(dep)) != 0) return error; } if ((fd = open(path, O_RDONLY)) == -1) { error = errno; MSDOSFS_DPRINTF(("open %s: %s", path, strerror(error))); return error; } if ((dat = mmap(0, nsize, PROT_READ, MAP_FILE | MAP_PRIVATE, fd, 0)) == MAP_FAILED) { error = errno; MSDOSFS_DPRINTF(("%s: mmap %s: %s", __func__, node->name, strerror(error))); close(fd); goto out; } close(fd); for (offs = 0; offs < nsize;) { int blsize, cpsize; daddr_t bn; u_long on = offs & pmp->pm_crbomask; if ((error = pcbmap(dep, cn++, &bn, NULL, &blsize)) != 0) { MSDOSFS_DPRINTF(("%s: pcbmap %lu", __func__, (unsigned long)bn)); goto out; } MSDOSFS_DPRINTF(("%s(cn=%lu, bn=%llu, blsize=%d)\n", __func__, cn, (unsigned long long)bn, blsize)); if ((error = bread(pmp->pm_devvp, bn, blsize, 0, &bp)) != 0) { MSDOSFS_DPRINTF(("bread %d\n", error)); goto out; } cpsize = MIN((nsize - offs), blsize - on); memcpy(bp->b_data + on, dat + offs, cpsize); bwrite(bp); offs += cpsize; } munmap(dat, nsize); return 0; out: munmap(dat, nsize); return error; } static const struct { struct direntry dot; struct direntry dotdot; } dosdirtemplate = { { ". ", /* the . entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ }, { ".. ", /* the .. entry */ ATTR_DIRECTORY, /* file attribute */ 0, /* reserved */ 0, { 0, 0 }, { 0, 0 }, /* create time & date */ { 0, 0 }, /* access date */ { 0, 0 }, /* high bits of start cluster */ { 210, 4 }, { 210, 4 }, /* modify time & date */ { 0, 0 }, /* startcluster */ { 0, 0, 0, 0 } /* filesize */ } }; struct denode * msdosfs_mkdire(const char *path, struct denode *pdep, fsnode *node) { struct denode ndirent; struct denode *dep; struct componentname cn; struct msdosfsmount *pmp = pdep->de_pmp; int error; u_long newcluster, pcl, bn; struct direntry *denp; struct buf *bp; cn.cn_nameptr = node->name; cn.cn_namelen = strlen(node->name); /* * If this is the root directory and there is no space left we * can't do anything. This is because the root directory can not * change size. */ if (pdep->de_StartCluster == MSDOSFSROOT && pdep->de_fndoffset >= pdep->de_FileSize) { error = ENOSPC; goto bad2; } /* * Allocate a cluster to hold the about to be created directory. */ error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL); if (error) goto bad2; memset(&ndirent, 0, sizeof(ndirent)); ndirent.de_pmp = pmp; ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE; msdosfs_times(&ndirent, &node->inode->st); /* * Now fill the cluster with the "." and ".." entries. And write * the cluster to disk. 
This way it is there for the parent * directory to be pointing at if there were a crash. */ bn = cntobn(pmp, newcluster); MSDOSFS_DPRINTF(("%s(newcluster %lu, bn=%lu)\n", __func__, newcluster, bn)); /* always succeeds */ bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0, 0); memset(bp->b_data, 0, pmp->pm_bpcluster); memcpy(bp->b_data, &dosdirtemplate, sizeof dosdirtemplate); denp = (struct direntry *)bp->b_data; putushort(denp[0].deStartCluster, newcluster); putushort(denp[0].deCDate, ndirent.de_CDate); putushort(denp[0].deCTime, ndirent.de_CTime); denp[0].deCHundredth = ndirent.de_CHun; putushort(denp[0].deADate, ndirent.de_ADate); putushort(denp[0].deMDate, ndirent.de_MDate); putushort(denp[0].deMTime, ndirent.de_MTime); pcl = pdep->de_StartCluster; MSDOSFS_DPRINTF(("%s(pcl %lu, rootdirblk=%lu)\n", __func__, pcl, pmp->pm_rootdirblk)); if (FAT32(pmp) && pcl == pmp->pm_rootdirblk) pcl = 0; putushort(denp[1].deStartCluster, pcl); putushort(denp[1].deCDate, ndirent.de_CDate); putushort(denp[1].deCTime, ndirent.de_CTime); denp[1].deCHundredth = ndirent.de_CHun; putushort(denp[1].deADate, ndirent.de_ADate); putushort(denp[1].deMDate, ndirent.de_MDate); putushort(denp[1].deMTime, ndirent.de_MTime); if (FAT32(pmp)) { putushort(denp[0].deHighClust, newcluster >> 16); putushort(denp[1].deHighClust, pdep->de_StartCluster >> 16); } else { putushort(denp[0].deHighClust, 0); putushort(denp[1].deHighClust, 0); } if ((error = bwrite(bp)) != 0) goto bad; /* * Now build up a directory entry pointing to the newly allocated * cluster. This will be written to an empty slot in the parent * directory. */ if ((error = uniqdosname(pdep, &cn, ndirent.de_Name)) != 0) goto bad; ndirent.de_Attributes = ATTR_DIRECTORY; ndirent.de_StartCluster = newcluster; ndirent.de_FileSize = 0; ndirent.de_pmp = pdep->de_pmp; if ((error = msdosfs_findslot(pdep, &cn)) != 0) goto bad; if ((error = createde(&ndirent, pdep, &dep, &cn)) != 0) goto bad; if ((error = msdosfs_updatede(dep)) != 0) goto bad; return dep; bad: clusterfree(pmp, newcluster, NULL); bad2: errno = error; return NULL; }
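As a closing illustration of the start-cluster handling in msdosfs_mkdire()
above: a FAT32 cluster number does not fit in the 16-bit deStartCluster
field, so its upper half is stored in deHighClust, while FAT12/FAT16
volumes simply write deHighClust as zero.  A standalone sketch of the
packing and unpacking (de_set_start/de_get_start are hypothetical helper
names; the real code calls putushort()/getushort() on the struct direntry
fields directly):

	#include <stdint.h>

	static void
	de_set_start(uint16_t *lo, uint16_t *hi, uint32_t cluster, int fat32)
	{
		*lo = (uint16_t)(cluster & 0xffff);
		*hi = fat32 ? (uint16_t)(cluster >> 16) : 0;
	}

	static uint32_t
	de_get_start(uint16_t lo, uint16_t hi, int fat32)
	{
		return fat32 ? (((uint32_t)hi << 16) | lo) : lo;
	}

This mirrors the putushort(denp[0].deStartCluster, newcluster) and
putushort(denp[0].deHighClust, newcluster >> 16) pair written into the
"." entry of the newly created directory.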