Index: head/sys/cam/ctl/ctl_backend_block.c =================================================================== --- head/sys/cam/ctl/ctl_backend_block.c (revision 364371) +++ head/sys/cam/ctl/ctl_backend_block.c (revision 364372) @@ -1,2812 +1,2812 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 Silicon Graphics International Corp. * Copyright (c) 2009-2011 Spectra Logic Corporation * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2014-2015 Alexander Motin * All rights reserved. * * Portions of this software were developed by Edward Tomasz Napierala * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $ */ /* * CAM Target Layer driver backend for block devices. * * Author: Ken Merry */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The idea here is that we'll allocate enough S/G space to hold a 1MB * I/O. If we get an I/O larger than that, we'll split it. */ #define CTLBLK_HALF_IO_SIZE (512 * 1024) #define CTLBLK_MAX_IO_SIZE (CTLBLK_HALF_IO_SIZE * 2) #define CTLBLK_MAX_SEG MAXPHYS #define CTLBLK_HALF_SEGS MAX(CTLBLK_HALF_IO_SIZE / CTLBLK_MAX_SEG, 1) #define CTLBLK_MAX_SEGS (CTLBLK_HALF_SEGS * 2) #ifdef CTLBLK_DEBUG #define DPRINTF(fmt, args...) \ printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) 
do {} while(0) #endif #define PRIV(io) \ ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND]) #define ARGS(io) \ ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]) SDT_PROVIDER_DEFINE(cbb); typedef enum { CTL_BE_BLOCK_LUN_UNCONFIGURED = 0x01, CTL_BE_BLOCK_LUN_WAITING = 0x04, } ctl_be_block_lun_flags; typedef enum { CTL_BE_BLOCK_NONE, CTL_BE_BLOCK_DEV, CTL_BE_BLOCK_FILE } ctl_be_block_type; struct ctl_be_block_filedata { struct ucred *cred; }; union ctl_be_block_bedata { struct ctl_be_block_filedata file; }; struct ctl_be_block_io; struct ctl_be_block_lun; typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun, const char *attrname); /* * Backend LUN structure. There is a 1:1 mapping between a block device * and a backend block LUN, and between a backend block LUN and a CTL LUN. */ struct ctl_be_block_lun { struct ctl_be_lun cbe_lun; /* Must be first element. */ struct ctl_lun_create_params params; char *dev_path; ctl_be_block_type dev_type; struct vnode *vn; union ctl_be_block_bedata backend; cbb_dispatch_t dispatch; cbb_dispatch_t lun_flush; cbb_dispatch_t unmap; cbb_dispatch_t get_lba_status; cbb_getattr_t getattr; uint64_t size_blocks; uint64_t size_bytes; struct ctl_be_block_softc *softc; struct devstat *disk_stats; ctl_be_block_lun_flags flags; SLIST_ENTRY(ctl_be_block_lun) links; struct taskqueue *io_taskqueue; struct task io_task; int num_threads; STAILQ_HEAD(, ctl_io_hdr) input_queue; STAILQ_HEAD(, ctl_io_hdr) config_read_queue; STAILQ_HEAD(, ctl_io_hdr) config_write_queue; STAILQ_HEAD(, ctl_io_hdr) datamove_queue; struct mtx_padalign io_lock; struct mtx_padalign queue_lock; }; /* * Overall softc structure for the block backend module. */ struct ctl_be_block_softc { struct sx modify_lock; struct mtx lock; int num_luns; SLIST_HEAD(, ctl_be_block_lun) lun_list; uma_zone_t beio_zone; uma_zone_t buf_zone; }; static struct ctl_be_block_softc backend_block_softc; /* * Per-I/O information. 
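 */

/*
 * Worked example of the S/G sizing macros above (comment added for
 * clarity; MAXPHYS = 128 * 1024 is an assumption, it is a kernel
 * build parameter):
 *
 *	CTLBLK_HALF_SEGS = MAX(524288 / 131072, 1) = 4
 *	CTLBLK_MAX_SEGS  = 4 * 2                   = 8
 *
 * so the sg_segs[] array below holds eight MAXPHYS-sized chunks: one
 * full 1MB I/O, or two 512KB halves when a COMPARE needs separate
 * read and datamove buffers.
 */

/*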
*/ struct ctl_be_block_io { union ctl_io *io; struct ctl_sg_entry sg_segs[CTLBLK_MAX_SEGS]; struct iovec xiovecs[CTLBLK_MAX_SEGS]; int refcnt; int bio_cmd; int two_sglists; int num_segs; int num_bios_sent; int num_bios_done; int send_complete; int first_error; uint64_t first_error_offset; struct bintime ds_t0; devstat_tag_type ds_tag_type; devstat_trans_flags ds_trans_type; uint64_t io_len; uint64_t io_offset; int io_arg; struct ctl_be_block_softc *softc; struct ctl_be_block_lun *lun; void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */ }; extern struct ctl_softc *control_softc; static int cbb_num_threads = 14; SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "CAM Target Layer Block Backend"); SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN, &cbb_num_threads, 0, "Number of threads per backing file"); static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc); static void ctl_free_beio(struct ctl_be_block_io *beio); static void ctl_complete_beio(struct ctl_be_block_io *beio); static int ctl_be_block_move_done(union ctl_io *io); static void ctl_be_block_biodone(struct bio *bio); static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname); static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio); static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname); static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io); static void ctl_be_block_worker(void *context, int pending); static int ctl_be_block_submit(union ctl_io *io); static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td); static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_close(struct ctl_be_block_lun *be_lun); static int ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req); static int ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static int ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static int ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req); static void ctl_be_block_lun_shutdown(struct ctl_be_lun *cbe_lun); static int ctl_be_block_config_write(union ctl_io *io); static int ctl_be_block_config_read(union ctl_io *io); static int ctl_be_block_lun_info(struct ctl_be_lun *cbe_lun, struct sbuf *sb); static uint64_t ctl_be_block_lun_attr(struct ctl_be_lun *cbe_lun, const char *attrname); static int ctl_be_block_init(void); static int ctl_be_block_shutdown(void); static struct ctl_backend_driver ctl_be_block_driver = { .name 
= "block", .flags = CTL_BE_FLAG_HAS_CONFIG, .init = ctl_be_block_init, .shutdown = ctl_be_block_shutdown, .data_submit = ctl_be_block_submit, .data_move_done = ctl_be_block_move_done, .config_read = ctl_be_block_config_read, .config_write = ctl_be_block_config_write, .ioctl = ctl_be_block_ioctl, .lun_info = ctl_be_block_lun_info, .lun_attr = ctl_be_block_lun_attr }; MALLOC_DEFINE(M_CTLBLK, "ctlblock", "Memory used for CTL block backend"); CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver); static struct ctl_be_block_io * ctl_alloc_beio(struct ctl_be_block_softc *softc) { struct ctl_be_block_io *beio; beio = uma_zalloc(softc->beio_zone, M_WAITOK | M_ZERO); beio->softc = softc; beio->refcnt = 1; return (beio); } static void ctl_real_free_beio(struct ctl_be_block_io *beio) { struct ctl_be_block_softc *softc = beio->softc; int i; for (i = 0; i < beio->num_segs; i++) { uma_zfree(softc->buf_zone, beio->sg_segs[i].addr); /* For compare we had two equal S/G lists. */ if (beio->two_sglists) { uma_zfree(softc->buf_zone, beio->sg_segs[i + CTLBLK_HALF_SEGS].addr); } } uma_zfree(softc->beio_zone, beio); } static void ctl_refcnt_beio(void *arg, int diff) { struct ctl_be_block_io *beio = arg; if (atomic_fetchadd_int(&beio->refcnt, diff) + diff == 0) ctl_real_free_beio(beio); } static void ctl_free_beio(struct ctl_be_block_io *beio) { ctl_refcnt_beio(beio, -1); } static void ctl_complete_beio(struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; if (beio->beio_cont != NULL) { beio->beio_cont(beio); } else { ctl_free_beio(beio); ctl_data_submit_done(io); } } static size_t cmp(uint8_t *a, uint8_t *b, size_t size) { size_t i; for (i = 0; i < size; i++) { if (a[i] != b[i]) break; } return (i); } static void ctl_be_block_compare(union ctl_io *io) { struct ctl_be_block_io *beio; uint64_t off, res; int i; uint8_t info[8]; beio = (struct ctl_be_block_io *)PRIV(io)->ptr; off = 0; for (i = 0; i < beio->num_segs; i++) { res = cmp(beio->sg_segs[i].addr, beio->sg_segs[i + CTLBLK_HALF_SEGS].addr, beio->sg_segs[i].len); off += res; if (res < beio->sg_segs[i].len) break; } if (i < beio->num_segs) { scsi_u64to8b(off, info); ctl_set_sense(&io->scsiio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_MISCOMPARE, /*asc*/ 0x1D, /*ascq*/ 0x00, /*type*/ SSD_ELEM_INFO, /*size*/ sizeof(info), /*data*/ &info, /*type*/ SSD_ELEM_NONE); } else ctl_set_success(&io->scsiio); } static int ctl_be_block_move_done(union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_lun *be_lun; struct ctl_lba_len_flags *lbalen; #ifdef CTL_TIME_IO struct bintime cur_bt; #endif beio = (struct ctl_be_block_io *)PRIV(io)->ptr; be_lun = beio->lun; DPRINTF("entered\n"); #ifdef CTL_TIME_IO getbinuptime(&cur_bt); bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt); bintime_add(&io->io_hdr.dma_bt, &cur_bt); #endif io->io_hdr.num_dmas++; io->scsiio.kern_rel_offset += io->scsiio.kern_data_len; /* * We set status at this point for read commands, and write * commands with errors. 
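 */

	/*
	 * Decision order of the chain below, summarized (comment added
	 * for clarity, derived from the code that follows):
	 *
	 *	1. aborted I/O			-> leave status untouched
	 *	2. port_status != 0		-> ctl_set_internal_failure()
	 *	3. short DATA OUT transfer	-> ctl_set_invalid_field_ciu()
	 *	4. clean CTL_LLF_READ		-> ctl_set_success()
	 *	5. clean CTL_LLF_COMPARE	-> ctl_be_block_compare()
	 *
	 * Error-free writes fall through and are queued below to the
	 * datamove_queue for backend execution in the worker thread.
	 */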
	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
		;
	} else if ((io->io_hdr.port_status != 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
		ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1,
		    /*retry_count*/ io->io_hdr.port_status);
	} else if (io->scsiio.kern_data_resid != 0 &&
	    (io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
		ctl_set_invalid_field_ciu(&io->scsiio);
	} else if ((io->io_hdr.port_status == 0) &&
	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE)) {
		lbalen = ARGS(beio->io);
		if (lbalen->flags & CTL_LLF_READ) {
			ctl_set_success(&io->scsiio);
		} else if (lbalen->flags & CTL_LLF_COMPARE) {
			/* We have two data blocks ready for comparison. */
			ctl_be_block_compare(io);
		}
	}

	/*
	 * If this is a read, or a write with errors, it is done.
	 */
	if ((beio->bio_cmd == BIO_READ) ||
	    ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0) ||
	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
		ctl_complete_beio(beio);
		return (0);
	}

	/*
	 * At this point, we have a write and the DMA completed
	 * successfully.  We now have to queue it to the task queue to
	 * execute the backend I/O.  That is because we do blocking
	 * memory allocations, and in the file backing case, blocking I/O.
	 * This move done routine is generally called in the SIM's
	 * interrupt context, and therefore we cannot block.
	 */
	mtx_lock(&be_lun->queue_lock);
	STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (0);
}

static void
ctl_be_block_biodone(struct bio *bio)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	union ctl_io *io;
	int error;

	beio = bio->bio_caller1;
	be_lun = beio->lun;
	io = beio->io;

	DPRINTF("entered\n");

	error = bio->bio_error;
	mtx_lock(&be_lun->io_lock);
	if (error != 0 &&
	    (beio->first_error == 0 ||
	     bio->bio_offset < beio->first_error_offset)) {
		beio->first_error = error;
		beio->first_error_offset = bio->bio_offset;
	}

	beio->num_bios_done++;

	/*
	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
	 * during the free might cause it to complain.
	 */
	g_destroy_bio(bio);

	/*
	 * If the send complete bit isn't set, or we aren't the last I/O to
	 * complete, then we're done.
	 */
	if ((beio->send_complete == 0) ||
	    (beio->num_bios_done < beio->num_bios_sent)) {
		mtx_unlock(&be_lun->io_lock);
		return;
	}

	/*
	 * At this point, we've verified that we are the last I/O to
	 * complete, so it's safe to drop the lock.
	 */
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If there are any errors from the backing device, we fail the
	 * entire I/O with a medium error.
	 */
	error = beio->first_error;
	if (error != 0) {
		if (error == EOPNOTSUPP) {
			ctl_set_invalid_opcode(&io->scsiio);
		} else if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else if (beio->bio_cmd == BIO_FLUSH) {
			/* XXX KDM is there a better error here? */
			ctl_set_internal_failure(&io->scsiio,
			    /*sks_valid*/ 1, /*retry_count*/ 0xbad2);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}
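/*
 * Illustrative sketch, not part of the original file: the errno to
 * SCSI sense policy implemented by ctl_be_block_biodone() above,
 * restated as a stand-alone helper.  All of the ctl_set_*() calls
 * are the ones used in this file.
 */
static void
example_map_bio_error(struct ctl_scsiio *ctsio, int error, int bio_cmd)
{

	if (error == EOPNOTSUPP)
		ctl_set_invalid_opcode(ctsio);
	else if (error == ENOSPC || error == EDQUOT)
		ctl_set_space_alloc_fail(ctsio);
	else if (error == EROFS || error == EACCES)
		ctl_set_hw_write_protected(ctsio);
	else if (bio_cmd == BIO_FLUSH)
		ctl_set_internal_failure(ctsio, /*sks_valid*/ 1,
		    /*retry_count*/ 0xbad2);
	else
		/* Reads get a read error, writes get a write error. */
		ctl_set_medium_error(ctsio, bio_cmd == BIO_READ);
}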
	/*
	 * If this is a write, a flush, a delete or verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (beio->bio_cmd == BIO_FLUSH) ||
	    (beio->bio_cmd == BIO_DELETE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_set_success(&io->scsiio);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_set_success(&io->scsiio);
			ctl_serseq_done(io);
		}
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}

static void
ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
    struct ctl_be_block_io *beio)
{
	union ctl_io *io = beio->io;
	struct mount *mountpoint;
	int error, lock_flags;

	DPRINTF("entered\n");

	binuptime(&beio->ds_t0);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

	if (MNT_SHARED_WRITES(mountpoint) ||
	    ((mountpoint == NULL) && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;
	vn_lock(be_lun->vn, lock_flags | LK_RETRY);
	error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT,
	    curthread);
	VOP_UNLOCK(be_lun->vn);
	vn_finished_write(mountpoint);

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	if (error == 0)
		ctl_set_success(&io->scsiio);
	else {
		/* XXX KDM is there a better error here? */
		ctl_set_internal_failure(&io->scsiio,
		    /*sks_valid*/ 1, /*retry_count*/ 0xbad1);
	}

	ctl_complete_beio(beio);
}

SDT_PROBE_DEFINE1(cbb, , read, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , read, file_done, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, file_done, "uint64_t");

static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
    struct ctl_be_block_io *beio)
{
	struct ctl_be_block_filedata *file_data;
	union ctl_io *io;
	struct uio xuio;
	struct iovec *xiovec;
	size_t s;
	int error, flags, i;

	DPRINTF("entered\n");

	file_data = &be_lun->backend.file;
	io = beio->io;
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, file_start);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE0(cbb, , write, file_start);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

	if (beio->bio_cmd == BIO_READ) {
		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 */
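		/*
		 * Condensed from the comments in this function (summary
		 * added for clarity, derived only from the statements
		 * above and below):
		 *
		 *		IO_DIRECT (DPO)		IO_SYNC (FUA)
		 * UFS read	honored			ignored
		 * UFS write	honored (async)		honored (sync write)
		 * ZFS read	ignored			honored (FRSYNC)
		 * ZFS write	ignored			honored (flush txg)
		 */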
		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);
		VOP_UNLOCK(be_lun->vn);
		SDT_PROBE0(cbb, , read, file_done);
		if (error == 0 && xuio.uio_resid > 0) {
			/*
			 * If we read less than requested (EOF), then
			 * we should zero the rest of the buffer.
			 */
			s = beio->io_len - xuio.uio_resid;
			for (i = 0; i < beio->num_segs; i++) {
				if (s >= beio->sg_segs[i].len) {
					s -= beio->sg_segs[i].len;
					continue;
				}
				bzero((uint8_t *)beio->sg_segs[i].addr + s,
				    beio->sg_segs[i].len - s);
				s = 0;
			}
		}
	} else {
		struct mount *mountpoint;
		int lock_flags;

		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);

		if (MNT_SHARED_WRITES(mountpoint) || ((mountpoint == NULL)
		  && MNT_SHARED_WRITES(be_lun->vn->v_mount)))
			lock_flags = LK_SHARED;
		else
			lock_flags = LK_EXCLUSIVE;
		vn_lock(be_lun->vn, lock_flags | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into the cache.)
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 */
		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
		VOP_UNLOCK(be_lun->vn);
		vn_finished_write(mountpoint);
		SDT_PROBE0(cbb, , write, file_done);
	}

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the sense data to "MEDIUM ERROR" and
	 * return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_set_space_alloc_fail(&io->scsiio);
		} else if (error == EROFS || error == EACCES) {
			ctl_set_hw_write_protected(&io->scsiio);
		} else {
			ctl_set_medium_error(&io->scsiio,
			    beio->bio_cmd == BIO_READ);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
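/*
 * Illustrative sketch, not from the original file: the core of the
 * vnode I/O that ctl_be_block_dispatch_file() above performs, with
 * devstat accounting stripped.  The real write path also brackets the
 * call with vn_start_write()/vn_finished_write() and may take the
 * vnode lock exclusive.
 */
static int
example_file_rw(struct vnode *vp, struct ctl_be_block_io *beio,
    struct ucred *cred, int flags)
{
	struct uio xuio;
	int error, i;

	/* Describe the S/G list as a kernel-space uio. */
	for (i = 0; i < beio->num_segs; i++) {
		beio->xiovecs[i].iov_base = beio->sg_segs[i].addr;
		beio->xiovecs[i].iov_len = beio->sg_segs[i].len;
	}
	bzero(&xuio, sizeof(xuio));
	xuio.uio_rw = (beio->bio_cmd == BIO_READ) ? UIO_READ : UIO_WRITE;
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = (xuio.uio_rw == UIO_READ) ?
	    VOP_READ(vp, &xuio, flags, cred) :
	    VOP_WRITE(vp, &xuio, flags, cred);
	VOP_UNLOCK(vp);
	return (error);
}
/*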
*/ if ((beio->bio_cmd == BIO_WRITE) || (ARGS(io)->flags & CTL_LLF_VERIFY)) { ctl_set_success(&io->scsiio); ctl_complete_beio(beio); } else { if ((ARGS(io)->flags & CTL_LLF_READ) && beio->beio_cont == NULL) { ctl_set_success(&io->scsiio); ctl_serseq_done(io); } #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; struct ctl_lba_len_flags *lbalen = ARGS(io); struct scsi_get_lba_status_data *data; off_t roff, off; int error, status; DPRINTF("entered\n"); off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize; vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off, 0, curthread->td_ucred, curthread); if (error == 0 && off > roff) status = 0; /* mapped up to off */ else { error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off, 0, curthread->td_ucred, curthread); if (error == 0 && off > roff) status = 1; /* deallocated up to off */ else { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; } } VOP_UNLOCK(be_lun->vn); data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr; scsi_u64to8b(lbalen->lba, data->descr[0].addr); scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize - lbalen->lba), data->descr[0].length); data->descr[0].status = status; ctl_complete_beio(beio); } static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname) { struct vattr vattr; struct statfs statfs; uint64_t val; int error; val = UINT64_MAX; if (be_lun->vn == NULL) return (val); vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); if (strcmp(attrname, "blocksused") == 0) { error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred); if (error == 0) val = vattr.va_bytes / be_lun->cbe_lun.blocksize; } if (strcmp(attrname, "blocksavail") == 0 && !VN_IS_DOOMED(be_lun->vn)) { error = VFS_STATFS(be_lun->vn->v_mount, &statfs); if (error == 0) val = statfs.f_bavail * statfs.f_bsize / be_lun->cbe_lun.blocksize; } VOP_UNLOCK(be_lun->vn); return (val); } static void ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io; struct cdevsw *csw; struct cdev *dev; struct uio xuio; struct iovec *xiovec; int error, flags, i, ref; DPRINTF("entered\n"); io = beio->io; flags = 0; if (ARGS(io)->flags & CTL_LLF_DPO) flags |= IO_DIRECT; if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA) flags |= IO_SYNC; bzero(&xuio, sizeof(xuio)); if (beio->bio_cmd == BIO_READ) { SDT_PROBE0(cbb, , read, file_start); xuio.uio_rw = UIO_READ; } else { SDT_PROBE0(cbb, , write, file_start); xuio.uio_rw = UIO_WRITE; } xuio.uio_offset = beio->io_offset; xuio.uio_resid = beio->io_len; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = beio->xiovecs; xuio.uio_iovcnt = beio->num_segs; xuio.uio_td = curthread; for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) { xiovec->iov_base = beio->sg_segs[i].addr; xiovec->iov_len = beio->sg_segs[i].len; } binuptime(&beio->ds_t0); devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw) { if (beio->bio_cmd == BIO_READ) error = csw->d_read(dev, &xuio, flags); else error = csw->d_write(dev, &xuio, flags); dev_relthread(dev, ref); } else error = ENXIO; if (beio->bio_cmd == BIO_READ) SDT_PROBE0(cbb, , read, file_done); else SDT_PROBE0(cbb, , write, file_done); mtx_lock(&be_lun->io_lock); devstat_end_transaction(beio->lun->disk_stats, 
beio->io_len, beio->ds_tag_type, beio->ds_trans_type, /*now*/ NULL, /*then*/&beio->ds_t0); mtx_unlock(&be_lun->io_lock); /* * If we got an error, set the sense data to "MEDIUM ERROR" and * return the I/O to the user. */ if (error != 0) { if (error == ENOSPC || error == EDQUOT) { ctl_set_space_alloc_fail(&io->scsiio); } else if (error == EROFS || error == EACCES) { ctl_set_hw_write_protected(&io->scsiio); } else { ctl_set_medium_error(&io->scsiio, beio->bio_cmd == BIO_READ); } ctl_complete_beio(beio); return; } /* * If this is a write or a verify, we're all done. * If this is a read, we can now send the data to the user. */ if ((beio->bio_cmd == BIO_WRITE) || (ARGS(io)->flags & CTL_LLF_VERIFY)) { ctl_set_success(&io->scsiio); ctl_complete_beio(beio); } else { if ((ARGS(io)->flags & CTL_LLF_READ) && beio->beio_cont == NULL) { ctl_set_success(&io->scsiio); ctl_serseq_done(io); } #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif ctl_datamove(io); } } static void ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io = beio->io; struct cdevsw *csw; struct cdev *dev; struct ctl_lba_len_flags *lbalen = ARGS(io); struct scsi_get_lba_status_data *data; off_t roff, off; int error, ref, status; DPRINTF("entered\n"); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; goto done; } off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize; error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD, curthread); if (error == 0 && off > roff) status = 0; /* mapped up to off */ else { error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD, curthread); if (error == 0 && off > roff) status = 1; /* deallocated up to off */ else { status = 0; /* unknown up to the end */ off = be_lun->size_bytes; } } dev_relthread(dev, ref); done: data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr; scsi_u64to8b(lbalen->lba, data->descr[0].addr); scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize - lbalen->lba), data->descr[0].length); data->descr[0].status = status; ctl_complete_beio(beio); } static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { struct bio *bio; struct cdevsw *csw; struct cdev *dev; int ref; DPRINTF("entered\n"); /* This can't fail, it's a blocking allocation. */ bio = g_alloc_bio(); bio->bio_cmd = BIO_FLUSH; bio->bio_offset = 0; bio->bio_data = 0; bio->bio_done = ctl_be_block_biodone; bio->bio_caller1 = beio; bio->bio_pblkno = 0; /* * We don't need to acquire the LUN lock here, because we are only * sending one bio, and so there is no other context to synchronize * with. 
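 */

	/*
	 * The hole-probing logic shared by ctl_be_block_gls_file() and
	 * ctl_be_block_gls_zvol() above, in outline (comment added for
	 * clarity):
	 *
	 *	off = lba * blocksize;
	 *	if (FIOSEEKHOLE moves off forward)
	 *		status = 0;	-> mapped up to off
	 *	else if (FIOSEEKDATA moves off forward)
	 *		status = 1;	-> deallocated up to off
	 *	else
	 *		status = 0, off = size_bytes;	-> unknown to the end
	 *
	 * The returned descriptor length is clamped to UINT32_MAX blocks.
	 */

	/*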
*/ beio->num_bios_sent = 1; beio->send_complete = 1; binuptime(&beio->ds_t0); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw) { bio->bio_dev = dev; csw->d_strategy(bio); dev_relthread(dev, ref); } else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } static void ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio, uint64_t off, uint64_t len, int last) { struct bio *bio; uint64_t maxlen; struct cdevsw *csw; struct cdev *dev; int ref; csw = devvn_refthread(be_lun->vn, &dev, &ref); maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize); while (len > 0) { bio = g_alloc_bio(); bio->bio_cmd = BIO_DELETE; bio->bio_dev = dev; bio->bio_offset = off; bio->bio_length = MIN(len, maxlen); bio->bio_data = 0; bio->bio_done = ctl_be_block_biodone; bio->bio_caller1 = beio; bio->bio_pblkno = off / be_lun->cbe_lun.blocksize; off += bio->bio_length; len -= bio->bio_length; mtx_lock(&be_lun->io_lock); beio->num_bios_sent++; if (last && len == 0) beio->send_complete = 1; mtx_unlock(&be_lun->io_lock); if (csw) { csw->d_strategy(bio); } else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } if (csw) dev_relthread(dev, ref); } static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { union ctl_io *io; struct ctl_ptr_len_flags *ptrlen; struct scsi_unmap_desc *buf, *end; uint64_t len; io = beio->io; DPRINTF("entered\n"); binuptime(&beio->ds_t0); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); if (beio->io_offset == -1) { beio->io_len = 0; ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; buf = (struct scsi_unmap_desc *)ptrlen->ptr; end = buf + ptrlen->len / sizeof(*buf); for (; buf < end; buf++) { len = (uint64_t)scsi_4btoul(buf->length) * be_lun->cbe_lun.blocksize; beio->io_len += len; ctl_be_block_unmap_dev_range(be_lun, beio, scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize, len, (end - buf < 2) ? TRUE : FALSE); } } else ctl_be_block_unmap_dev_range(be_lun, beio, beio->io_offset, beio->io_len, TRUE); } static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun, struct ctl_be_block_io *beio) { TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue); struct bio *bio; struct cdevsw *csw; struct cdev *dev; off_t cur_offset; int i, max_iosize, ref; DPRINTF("entered\n"); csw = devvn_refthread(be_lun->vn, &dev, &ref); /* * We have to limit our I/O size to the maximum supported by the * backend device. Hopefully it is MAXPHYS. If the driver doesn't * set it properly, use DFLTPHYS. */ if (csw) { max_iosize = dev->si_iosize_max; if (max_iosize < PAGE_SIZE) max_iosize = DFLTPHYS; } else max_iosize = DFLTPHYS; cur_offset = beio->io_offset; for (i = 0; i < beio->num_segs; i++) { size_t cur_size; uint8_t *cur_ptr; cur_size = beio->sg_segs[i].len; cur_ptr = beio->sg_segs[i].addr; while (cur_size > 0) { /* This can't fail, it's a blocking allocation. 
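 */

		/*
		 * Example of the maxlen computation in
		 * ctl_be_block_unmap_dev_range() above (comment added for
		 * clarity): with a 512-byte block size on LP64,
		 *
		 *	maxlen = LONG_MAX - (LONG_MAX % 512)
		 *	       = 0x7fffffffffffffff - 511
		 *	       = 0x7ffffffffffffe00
		 *
		 * i.e. the largest block-aligned value that still fits in
		 * bio_length, so every BIO_DELETE chunk stays block-aligned
		 * and the splitting loop terminates.
		 */

		/*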
*/ bio = g_alloc_bio(); KASSERT(bio != NULL, ("g_alloc_bio() failed!\n")); bio->bio_cmd = beio->bio_cmd; bio->bio_dev = dev; bio->bio_caller1 = beio; bio->bio_length = min(cur_size, max_iosize); bio->bio_offset = cur_offset; bio->bio_data = cur_ptr; bio->bio_done = ctl_be_block_biodone; bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize; cur_offset += bio->bio_length; cur_ptr += bio->bio_length; cur_size -= bio->bio_length; TAILQ_INSERT_TAIL(&queue, bio, bio_queue); beio->num_bios_sent++; } } beio->send_complete = 1; binuptime(&beio->ds_t0); devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0); /* * Fire off all allocated requests! */ while ((bio = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, bio, bio_queue); if (csw) csw->d_strategy(bio); else { bio->bio_error = ENXIO; ctl_be_block_biodone(bio); } } if (csw) dev_relthread(dev, ref); } static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname) { struct diocgattr_arg arg; struct cdevsw *csw; struct cdev *dev; int error, ref; csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) return (UINT64_MAX); strlcpy(arg.name, attrname, sizeof(arg.name)); arg.len = sizeof(arg.value.off); if (csw->d_ioctl) { error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD, curthread); } else error = ENODEV; dev_relthread(dev, ref); if (error != 0) return (UINT64_MAX); return (arg.value.off); } static void ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_lba_len_flags *lbalen; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; beio->io_len = lbalen->len * cbe_lun->blocksize; beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_arg = (lbalen->flags & SSC_IMMED) != 0; beio->bio_cmd = BIO_FLUSH; beio->ds_trans_type = DEVSTAT_NO_DATA; DPRINTF("SYNC\n"); be_lun->lun_flush(be_lun, beio); } static void ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); if ((io->io_hdr.flags & CTL_FLAG_ABORT) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { ctl_config_write_done(io); return; } ctl_be_block_config_write(io); } static void ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_softc *softc = be_lun->softc; struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_lba_len_flags *lbalen; uint64_t len_left, lba; uint32_t pb, pbo, adj; int i, seglen; uint8_t *buf, *end; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; lbalen = ARGS(beio->io); if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) || (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) { ctl_free_beio(beio); ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 0, /*bit*/ 0); ctl_config_write_done(io); return; } if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) { beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize; beio->bio_cmd = BIO_DELETE; beio->ds_trans_type = DEVSTAT_FREE; be_lun->unmap(be_lun, beio); return; } beio->bio_cmd = BIO_WRITE; beio->ds_trans_type = DEVSTAT_WRITE; DPRINTF("WRITE SAME at LBA %jx len %u\n", (uintmax_t)lbalen->lba, lbalen->len); pb = 
cbe_lun->blocksize << be_lun->cbe_lun.pblockexp; if (be_lun->cbe_lun.pblockoff > 0) pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff; else pbo = 0; len_left = (uint64_t)lbalen->len * cbe_lun->blocksize; for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) { /* * Setup the S/G entry for this chunk. */ seglen = MIN(CTLBLK_MAX_SEG, len_left); if (pb > cbe_lun->blocksize) { adj = ((lbalen->lba + lba) * cbe_lun->blocksize + seglen - pbo) % pb; if (seglen > adj) seglen -= adj; else seglen -= seglen % cbe_lun->blocksize; } else seglen -= seglen % cbe_lun->blocksize; beio->sg_segs[i].len = seglen; beio->sg_segs[i].addr = uma_zalloc(softc->buf_zone, M_WAITOK); DPRINTF("segment %d addr %p len %zd\n", i, beio->sg_segs[i].addr, beio->sg_segs[i].len); beio->num_segs++; len_left -= seglen; buf = beio->sg_segs[i].addr; end = buf + seglen; for (; buf < end; buf += cbe_lun->blocksize) { if (lbalen->flags & SWS_NDOB) { memset(buf, 0, cbe_lun->blocksize); } else { memcpy(buf, io->scsiio.kern_data_ptr, cbe_lun->blocksize); } if (lbalen->flags & SWS_LBDATA) scsi_ulto4b(lbalen->lba + lba, buf); lba++; } } beio->io_offset = lbalen->lba * cbe_lun->blocksize; beio->io_len = lba * cbe_lun->blocksize; /* We can not do all in one run. Correct and schedule rerun. */ if (len_left > 0) { lbalen->lba += lba; lbalen->len -= lba; beio->beio_cont = ctl_be_block_cw_done_ws; } be_lun->dispatch(be_lun, beio); } static void ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_ptr_len_flags *ptrlen; DPRINTF("entered\n"); beio = (struct ctl_be_block_io *)PRIV(io)->ptr; ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) { ctl_free_beio(beio); ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 0, /*command*/ 1, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); ctl_config_write_done(io); return; } beio->io_len = 0; beio->io_offset = -1; beio->bio_cmd = BIO_DELETE; beio->ds_trans_type = DEVSTAT_FREE; DPRINTF("UNMAP\n"); be_lun->unmap(be_lun, beio); } static void ctl_be_block_cr_done(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); ctl_config_read_done(io); } static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; DPRINTF("entered\n"); softc = be_lun->softc; beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; beio->beio_cont = ctl_be_block_cr_done; PRIV(io)->ptr = (void *)beio; switch (io->scsiio.cdb[0]) { case SERVICE_ACTION_IN: /* GET LBA STATUS */ beio->bio_cmd = -1; beio->ds_trans_type = DEVSTAT_NO_DATA; beio->ds_tag_type = DEVSTAT_TAG_ORDERED; beio->io_len = 0; if (be_lun->get_lba_status) be_lun->get_lba_status(be_lun, beio); else ctl_be_block_cr_done(beio); break; default: panic("Unhandled CDB type %#x", io->scsiio.cdb[0]); break; } } static void ctl_be_block_cw_done(struct ctl_be_block_io *beio) { union ctl_io *io; io = beio->io; ctl_free_beio(beio); ctl_config_write_done(io); } static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; DPRINTF("entered\n"); softc = be_lun->softc; beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; beio->beio_cont = ctl_be_block_cw_done; switch (io->scsiio.tag_type) { case CTL_TAG_ORDERED: beio->ds_tag_type = DEVSTAT_TAG_ORDERED; break; case CTL_TAG_HEAD_OF_QUEUE: 
beio->ds_tag_type = DEVSTAT_TAG_HEAD; break; case CTL_TAG_UNTAGGED: case CTL_TAG_SIMPLE: case CTL_TAG_ACA: default: beio->ds_tag_type = DEVSTAT_TAG_SIMPLE; break; } PRIV(io)->ptr = (void *)beio; switch (io->scsiio.cdb[0]) { case SYNCHRONIZE_CACHE: case SYNCHRONIZE_CACHE_16: ctl_be_block_cw_dispatch_sync(be_lun, io); break; case WRITE_SAME_10: case WRITE_SAME_16: ctl_be_block_cw_dispatch_ws(be_lun, io); break; case UNMAP: ctl_be_block_cw_dispatch_unmap(be_lun, io); break; default: panic("Unhandled CDB type %#x", io->scsiio.cdb[0]); break; } } SDT_PROBE_DEFINE1(cbb, , read, start, "uint64_t"); SDT_PROBE_DEFINE1(cbb, , write, start, "uint64_t"); SDT_PROBE_DEFINE1(cbb, , read, alloc_done, "uint64_t"); SDT_PROBE_DEFINE1(cbb, , write, alloc_done, "uint64_t"); static void ctl_be_block_next(struct ctl_be_block_io *beio) { struct ctl_be_block_lun *be_lun; union ctl_io *io; io = beio->io; be_lun = beio->lun; ctl_free_beio(beio); if ((io->io_hdr.flags & CTL_FLAG_ABORT) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { ctl_data_submit_done(io); return; } io->io_hdr.status &= ~CTL_STATUS_MASK; io->io_hdr.status |= CTL_STATUS_NONE; mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); } static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun, union ctl_io *io) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_be_block_io *beio; struct ctl_be_block_softc *softc; struct ctl_lba_len_flags *lbalen; struct ctl_ptr_len_flags *bptrlen; uint64_t len_left, lbas; int i; softc = be_lun->softc; DPRINTF("entered\n"); lbalen = ARGS(io); if (lbalen->flags & CTL_LLF_WRITE) { SDT_PROBE0(cbb, , write, start); } else { SDT_PROBE0(cbb, , read, start); } beio = ctl_alloc_beio(softc); beio->io = io; beio->lun = be_lun; bptrlen = PRIV(io); bptrlen->ptr = (void *)beio; switch (io->scsiio.tag_type) { case CTL_TAG_ORDERED: beio->ds_tag_type = DEVSTAT_TAG_ORDERED; break; case CTL_TAG_HEAD_OF_QUEUE: beio->ds_tag_type = DEVSTAT_TAG_HEAD; break; case CTL_TAG_UNTAGGED: case CTL_TAG_SIMPLE: case CTL_TAG_ACA: default: beio->ds_tag_type = DEVSTAT_TAG_SIMPLE; break; } if (lbalen->flags & CTL_LLF_WRITE) { beio->bio_cmd = BIO_WRITE; beio->ds_trans_type = DEVSTAT_WRITE; } else { beio->bio_cmd = BIO_READ; beio->ds_trans_type = DEVSTAT_READ; } DPRINTF("%s at LBA %jx len %u @%ju\n", (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE", (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len); if (lbalen->flags & CTL_LLF_COMPARE) { beio->two_sglists = 1; lbas = CTLBLK_HALF_IO_SIZE; } else { lbas = CTLBLK_MAX_IO_SIZE; } lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize); beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize; beio->io_len = lbas * cbe_lun->blocksize; bptrlen->len += lbas; for (i = 0, len_left = beio->io_len; len_left > 0; i++) { KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)", i, CTLBLK_MAX_SEGS)); /* * Setup the S/G entry for this chunk. */ beio->sg_segs[i].len = min(CTLBLK_MAX_SEG, len_left); beio->sg_segs[i].addr = uma_zalloc(softc->buf_zone, M_WAITOK); DPRINTF("segment %d addr %p len %zd\n", i, beio->sg_segs[i].addr, beio->sg_segs[i].len); /* Set up second segment for compare operation. 
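 */

		/*
		 * COMPARE buffer layout, restated (comment added for
		 * clarity): the first CTLBLK_HALF_SEGS entries of sg_segs[]
		 * receive the data read from the backing store, the second
		 * half receives the initiator's data via datamove
		 * (kern_data_ptr is pointed at sg_segs[CTLBLK_HALF_SEGS]
		 * below), and ctl_be_block_compare() walks the two halves
		 * in lockstep, returning MISCOMPARE sense with the first
		 * differing byte offset in the INFORMATION field.
		 */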
		if (beio->two_sglists) {
			beio->sg_segs[i + CTLBLK_HALF_SEGS].len =
			    beio->sg_segs[i].len;
			beio->sg_segs[i + CTLBLK_HALF_SEGS].addr =
			    uma_zalloc(softc->buf_zone, M_WAITOK);
		}

		beio->num_segs++;
		len_left -= beio->sg_segs[i].len;
	}
	if (bptrlen->len < lbalen->len)
		beio->beio_cont = ctl_be_block_next;
	io->scsiio.be_move_done = ctl_be_block_move_done;
	/* For compare we have separate S/G lists for read and datamove. */
	if (beio->two_sglists)
		io->scsiio.kern_data_ptr =
		    (uint8_t *)&beio->sg_segs[CTLBLK_HALF_SEGS];
	else
		io->scsiio.kern_data_ptr = (uint8_t *)beio->sg_segs;
	io->scsiio.kern_data_len = beio->io_len;
	io->scsiio.kern_sg_entries = beio->num_segs;
	io->scsiio.kern_data_ref = ctl_refcnt_beio;
	io->scsiio.kern_data_arg = beio;
	io->io_hdr.flags |= CTL_FLAG_ALLOCATED;

	/*
	 * For the read case, we need to read the data into our buffers and
	 * then we can send it back to the user.  For the write case, we
	 * need to get the data from the user first.
	 */
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, alloc_done);
		be_lun->dispatch(be_lun, beio);
	} else {
		SDT_PROBE0(cbb, , write, alloc_done);
#ifdef CTL_TIME_IO
		getbinuptime(&io->io_hdr.dma_start_bt);
#endif
		ctl_datamove(io);
	}
}

static void
ctl_be_block_worker(void *context, int pending)
{
	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context;
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	union ctl_io *io;
	struct ctl_be_block_io *beio;

	DPRINTF("entered\n");
	/*
	 * Fetch and process I/Os from all queues.  If we detect LUN
	 * CTL_LUN_FLAG_NO_MEDIA status here, it is the result of a race,
	 * so make the response maximally opaque so as not to confuse the
	 * initiator.
	 */
	for (;;) {
		mtx_lock(&be_lun->queue_lock);
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
		if (io != NULL) {
			DPRINTF("datamove queue\n");
			STAILQ_REMOVE(&be_lun->datamove_queue, &io->io_hdr,
			    ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_complete_beio(beio);
				return;
			}
			be_lun->dispatch(be_lun, beio);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
		if (io != NULL) {
			DPRINTF("config write queue\n");
			STAILQ_REMOVE(&be_lun->config_write_queue, &io->io_hdr,
			    ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_config_write_done(io);
				return;
			}
			ctl_be_block_cw_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
		if (io != NULL) {
			DPRINTF("config read queue\n");
			STAILQ_REMOVE(&be_lun->config_read_queue, &io->io_hdr,
			    ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_config_read_done(io);
				return;
			}
			ctl_be_block_cr_dispatch(be_lun, io);
			continue;
		}
		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
		if (io != NULL) {
			DPRINTF("input queue\n");
			STAILQ_REMOVE(&be_lun->input_queue, &io->io_hdr,
			    ctl_io_hdr, links);
			mtx_unlock(&be_lun->queue_lock);
			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
				ctl_set_busy(&io->scsiio);
				ctl_data_submit_done(io);
				return;
			}
			ctl_be_block_dispatch(be_lun, io);
			continue;
		}

		/*
		 * If we get here, there is no work left in the queues, so
		 * just break out and let the task queue go to sleep.
		 */
		mtx_unlock(&be_lun->queue_lock);
		break;
	}
}

/*
 * Entry point from CTL to the backend for I/O.  We queue everything to a
 * work thread, so this just puts the I/O on a queue and wakes up the
 * thread.
 */
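/*
 * Illustrative sketch, not from the original file: the enqueue-and-kick
 * pattern shared by ctl_be_block_submit() below and
 * ctl_be_block_move_done() above.
 */
static void
example_queue_io(struct ctl_be_block_lun *be_lun, union ctl_io *io)
{

	mtx_lock(&be_lun->queue_lock);
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	/* Wake a worker thread; redundant enqueues coalesce. */
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
}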
static int
ctl_be_block_submit(union ctl_io *io)
{
	struct ctl_be_block_lun *be_lun;

	DPRINTF("entered\n");

	be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);

	/*
	 * Make sure we only get SCSI I/O.
	 */
	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Non-SCSI I/O (type "
	    "%#x) encountered", io->io_hdr.io_type));

	PRIV(io)->len = 0;

	mtx_lock(&be_lun->queue_lock);
	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
	mtx_unlock(&be_lun->queue_lock);
	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);

	return (CTL_RETVAL_COMPLETE);
}

static int
ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
    int flag, struct thread *td)
{
	struct ctl_be_block_softc *softc = &backend_block_softc;
	int error;

	error = 0;
	switch (cmd) {
	case CTL_LUN_REQ: {
		struct ctl_lun_req *lun_req;

		lun_req = (struct ctl_lun_req *)addr;

		switch (lun_req->reqtype) {
		case CTL_LUNREQ_CREATE:
			error = ctl_be_block_create(softc, lun_req);
			break;
		case CTL_LUNREQ_RM:
			error = ctl_be_block_rm(softc, lun_req);
			break;
		case CTL_LUNREQ_MODIFY:
			error = ctl_be_block_modify(softc, lun_req);
			break;
		default:
			lun_req->status = CTL_LUN_ERROR;
			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
				 "invalid LUN request type %d",
				 lun_req->reqtype);
			break;
		}
		break;
	}
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}

static int
ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun;
	struct ctl_be_block_filedata *file_data;
	struct ctl_lun_create_params *params;
	const char		     *value;
	struct vattr		      vattr;
	off_t			      ps, pss, po, pos, us, uss, uo, uos;
	int			      error;

	cbe_lun = &be_lun->cbe_lun;
	file_data = &be_lun->backend.file;
	params = &be_lun->params;

	be_lun->dev_type = CTL_BE_BLOCK_FILE;
	be_lun->dispatch = ctl_be_block_dispatch_file;
	be_lun->lun_flush = ctl_be_block_flush_file;
	be_lun->get_lba_status = ctl_be_block_gls_file;
	be_lun->getattr = ctl_be_block_getattr_file;
	be_lun->unmap = NULL;
	cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;

	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
	if (error != 0) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "error calling VOP_GETATTR() for file %s",
			 be_lun->dev_path);
		return (error);
	}

	file_data->cred = crhold(curthread->td_ucred);
	if (params->lun_size_bytes != 0)
		be_lun->size_bytes = params->lun_size_bytes;
	else
		be_lun->size_bytes = vattr.va_size;

	/*
	 * For files we can use any logical block size.  Prefer 512 bytes
	 * for compatibility reasons.  If the file's vattr.va_blocksize
	 * (preferred I/O block size) is bigger than and a multiple of the
	 * chosen logical block size, report it as the physical block size.
	 */
	if (params->blocksize_bytes != 0)
		cbe_lun->blocksize = params->blocksize_bytes;
	else if (cbe_lun->lun_type == T_CDROM)
		cbe_lun->blocksize = 2048;
	else
		cbe_lun->blocksize = 512;
	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
0 : (be_lun->size_blocks - 1); us = ps = vattr.va_blocksize; uo = po = 0; value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL); if (value != NULL) ctl_expand_number(value, &ps); value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL); if (value != NULL) ctl_expand_number(value, &po); pss = ps / cbe_lun->blocksize; pos = po / cbe_lun->blocksize; if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) && ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) { cbe_lun->pblockexp = fls(pss) - 1; cbe_lun->pblockoff = (pss - pos) % pss; } value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL); if (value != NULL) ctl_expand_number(value, &us); value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL); if (value != NULL) ctl_expand_number(value, &uo); uss = us / cbe_lun->blocksize; uos = uo / cbe_lun->blocksize; if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) && ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) { cbe_lun->ublockexp = fls(uss) - 1; cbe_lun->ublockoff = (uss - uos) % uss; } /* * Sanity check. The media size has to be at least one * sector long. */ if (be_lun->size_bytes < cbe_lun->blocksize) { error = EINVAL; snprintf(req->error_str, sizeof(req->error_str), "file %s size %ju < block size %u", be_lun->dev_path, (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize); } cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize; return (error); } static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct ctl_lun_create_params *params; struct cdevsw *csw; struct cdev *dev; const char *value; int error, atomic, maxio, ref, unmap, tmp; off_t ps, pss, po, pos, us, uss, uo, uos, otmp; params = &be_lun->params; be_lun->dev_type = CTL_BE_BLOCK_DEV; csw = devvn_refthread(be_lun->vn, &dev, &ref); if (csw == NULL) return (ENXIO); if (strcmp(csw->d_name, "zvol") == 0) { be_lun->dispatch = ctl_be_block_dispatch_zvol; be_lun->get_lba_status = ctl_be_block_gls_zvol; atomic = maxio = CTLBLK_MAX_IO_SIZE; } else { be_lun->dispatch = ctl_be_block_dispatch_dev; be_lun->get_lba_status = NULL; atomic = 0; maxio = dev->si_iosize_max; if (maxio <= 0) maxio = DFLTPHYS; if (maxio > CTLBLK_MAX_IO_SIZE) maxio = CTLBLK_MAX_IO_SIZE; } be_lun->lun_flush = ctl_be_block_flush_dev; be_lun->getattr = ctl_be_block_getattr_dev; be_lun->unmap = ctl_be_block_unmap_dev; if (!csw->d_ioctl) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "no d_ioctl for device %s!", be_lun->dev_path); return (ENODEV); } error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD, curthread); if (error) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "error %d returned for DIOCGSECTORSIZE ioctl " "on %s!", error, be_lun->dev_path); return (error); } /* * If the user has asked for a blocksize that is greater than the * backing device's blocksize, we can do it only if the blocksize * the user is asking for is an even multiple of the underlying * device's blocksize. 
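 */

	/*
	 * Worked example of the physical/UNMAP block math used in both
	 * open paths (comment added for clarity): a 512-byte logical
	 * block with "pblocksize" 4096 and "pblockoffset" 0 gives
	 *
	 *	pss = 4096 / 512 = 8	(power of two, and 8 * 512 == 4096)
	 *	pblockexp = fls(8) - 1 = 3	-> 2^3 logical per physical
	 *	pblockoff = (8 - 0) % 8 = 0
	 *
	 * Values that are not power-of-two multiples of the logical block
	 * size fail the (pss & (pss - 1)) test and are silently ignored.
	 */

	/*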
*/ if ((params->blocksize_bytes != 0) && (params->blocksize_bytes >= tmp)) { if (params->blocksize_bytes % tmp == 0) { cbe_lun->blocksize = params->blocksize_bytes; } else { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested blocksize %u is not an even " "multiple of backing device blocksize %u", params->blocksize_bytes, tmp); return (EINVAL); } } else if (params->blocksize_bytes != 0) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested blocksize %u < backing device " "blocksize %u", params->blocksize_bytes, tmp); return (EINVAL); } else if (cbe_lun->lun_type == T_CDROM) cbe_lun->blocksize = MAX(tmp, 2048); else cbe_lun->blocksize = tmp; error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD, curthread); if (error) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "error %d returned for DIOCGMEDIASIZE " " ioctl on %s!", error, be_lun->dev_path); return (error); } if (params->lun_size_bytes != 0) { if (params->lun_size_bytes > otmp) { dev_relthread(dev, ref); snprintf(req->error_str, sizeof(req->error_str), "requested LUN size %ju > backing device " "size %ju", (uintmax_t)params->lun_size_bytes, (uintmax_t)otmp); return (EINVAL); } be_lun->size_bytes = params->lun_size_bytes; } else be_lun->size_bytes = otmp; be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize; cbe_lun->maxlba = (be_lun->size_blocks == 0) ? 0 : (be_lun->size_blocks - 1); error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD, curthread); if (error) ps = po = 0; else { error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po, FREAD, curthread); if (error) po = 0; } us = ps; uo = po; value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL); if (value != NULL) ctl_expand_number(value, &ps); value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL); if (value != NULL) ctl_expand_number(value, &po); pss = ps / cbe_lun->blocksize; pos = po / cbe_lun->blocksize; if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) && ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) { cbe_lun->pblockexp = fls(pss) - 1; cbe_lun->pblockoff = (pss - pos) % pss; } value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL); if (value != NULL) ctl_expand_number(value, &us); value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL); if (value != NULL) ctl_expand_number(value, &uo); uss = us / cbe_lun->blocksize; uos = uo / cbe_lun->blocksize; if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) && ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) { cbe_lun->ublockexp = fls(uss) - 1; cbe_lun->ublockoff = (uss - uos) % uss; } cbe_lun->atomicblock = atomic / cbe_lun->blocksize; cbe_lun->opttxferlen = maxio / cbe_lun->blocksize; if (be_lun->dispatch == ctl_be_block_dispatch_zvol) { unmap = 1; } else { struct diocgattr_arg arg; strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); arg.len = sizeof(arg.value.i); error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD, curthread); unmap = (error == 0) ? 
arg.value.i : 0; } value = dnvlist_get_string(cbe_lun->options, "unmap", NULL); if (value != NULL) unmap = (strcmp(value, "on") == 0); if (unmap) cbe_lun->flags |= CTL_LUN_FLAG_UNMAP; else cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP; dev_relthread(dev, ref); return (0); } static int ctl_be_block_close(struct ctl_be_block_lun *be_lun) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; int flags; if (be_lun->vn) { flags = FREAD; if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0) flags |= FWRITE; (void)vn_close(be_lun->vn, flags, NOCRED, curthread); be_lun->vn = NULL; switch (be_lun->dev_type) { case CTL_BE_BLOCK_DEV: break; case CTL_BE_BLOCK_FILE: if (be_lun->backend.file.cred != NULL) { crfree(be_lun->backend.file.cred); be_lun->backend.file.cred = NULL; } break; case CTL_BE_BLOCK_NONE: break; default: panic("Unexpected backend type %d", be_lun->dev_type); break; } be_lun->dev_type = CTL_BE_BLOCK_NONE; } return (0); } static int ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req) { struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun; struct nameidata nd; const char *value; int error, flags; error = 0; if (rootvnode == NULL) { snprintf(req->error_str, sizeof(req->error_str), "Root filesystem is not mounted"); return (1); } pwd_ensure_dirs(); value = dnvlist_get_string(cbe_lun->options, "file", NULL); if (value == NULL) { snprintf(req->error_str, sizeof(req->error_str), "no file argument specified"); return (1); } free(be_lun->dev_path, M_CTLBLK); be_lun->dev_path = strdup(value, M_CTLBLK); flags = FREAD; value = dnvlist_get_string(cbe_lun->options, "readonly", NULL); if (value != NULL) { if (strcmp(value, "on") != 0) flags |= FWRITE; } else if (cbe_lun->lun_type == T_DIRECT) flags |= FWRITE; again: NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path, curthread); error = vn_open(&nd, &flags, 0, NULL); if ((error == EROFS || error == EACCES) && (flags & FWRITE)) { flags &= ~FWRITE; goto again; } if (error) { /* * This is the only reasonable guess we can make as far as * path if the user doesn't give us a fully qualified path. * If they want to specify a file, they need to specify the * full path. */ if (be_lun->dev_path[0] != '/') { char *dev_name; asprintf(&dev_name, M_CTLBLK, "/dev/%s", be_lun->dev_path); free(be_lun->dev_path, M_CTLBLK); be_lun->dev_path = dev_name; goto again; } snprintf(req->error_str, sizeof(req->error_str), "error opening %s: %d", be_lun->dev_path, error); return (error); } if (flags & FWRITE) cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY; else cbe_lun->flags |= CTL_LUN_FLAG_READONLY; NDFREE(&nd, NDF_ONLY_PNBUF); be_lun->vn = nd.ni_vp; /* We only support disks and files. 
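 */

	/*
	 * Note on the change just below (added context; see sys/vnode.h
	 * for the authoritative prototypes): this revision is part of the
	 * split of the old two-argument vn_isdisk() into
	 *
	 *	int vn_isdisk(struct vnode *vp);
	 *	int vn_isdisk_error(struct vnode *vp, int *errp);
	 *
	 * The _error variant keeps the old behavior of returning an errno
	 * (e.g. ENOTBLK for a non-disk vnode) through *errp, which this
	 * caller wants for its error reporting.
	 */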
-	if (vn_isdisk(be_lun->vn, &error)) {
+	if (vn_isdisk_error(be_lun->vn, &error)) {
		error = ctl_be_block_open_dev(be_lun, req);
	} else if (be_lun->vn->v_type == VREG) {
		error = ctl_be_block_open_file(be_lun, req);
	} else {
		error = EINVAL;
		snprintf(req->error_str, sizeof(req->error_str),
			 "%s is not a disk or plain file", be_lun->dev_path);
	}
	VOP_UNLOCK(be_lun->vn);

	if (error != 0)
		ctl_be_block_close(be_lun);
	cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	value = dnvlist_get_string(cbe_lun->options, "serseq", NULL);
	if (value != NULL && strcmp(value, "on") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_ON;
	else if (value != NULL && strcmp(value, "read") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
	else if (value != NULL && strcmp(value, "off") == 0)
		cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
	return (0);
}

static int
ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
{
	struct ctl_be_lun *cbe_lun;
	struct ctl_be_block_lun *be_lun;
	struct ctl_lun_create_params *params;
	char tmpstr[32];
	const char *value;
	int retval, num_threads;
	int tmp_num_threads;

	params = &req->reqdata.create;
	retval = 0;
	req->status = CTL_LUN_OK;

	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
	cbe_lun = &be_lun->cbe_lun;
	be_lun->params = req->reqdata.create;
	be_lun->softc = softc;
	STAILQ_INIT(&be_lun->input_queue);
	STAILQ_INIT(&be_lun->config_read_queue);
	STAILQ_INIT(&be_lun->config_write_queue);
	STAILQ_INIT(&be_lun->datamove_queue);
	mtx_init(&be_lun->io_lock, "ctlblock io", NULL, MTX_DEF);
	mtx_init(&be_lun->queue_lock, "ctlblock queue", NULL, MTX_DEF);
	cbe_lun->options = nvlist_clone(req->args_nvl);

	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
		cbe_lun->lun_type = params->device_type;
	else
		cbe_lun->lun_type = T_DIRECT;
	be_lun->flags = 0;
	cbe_lun->flags = 0;
	value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
	if (value != NULL) {
		if (strcmp(value, "primary") == 0)
			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;

	if (cbe_lun->lun_type == T_DIRECT ||
	    cbe_lun->lun_type == T_CDROM) {
		be_lun->size_bytes = params->lun_size_bytes;
		if (params->blocksize_bytes != 0)
			cbe_lun->blocksize = params->blocksize_bytes;
		else if (cbe_lun->lun_type == T_CDROM)
			cbe_lun->blocksize = 2048;
		else
			cbe_lun->blocksize = 512;
		be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
		cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
		    0 : (be_lun->size_blocks - 1);

		if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
		    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
			retval = ctl_be_block_open(be_lun, req);
			if (retval != 0) {
				retval = 0;
				req->status = CTL_LUN_WARNING;
			}
		}
		num_threads = cbb_num_threads;
	} else {
		num_threads = 1;
	}

	value = dnvlist_get_string(cbe_lun->options, "num_threads", NULL);
	if (value != NULL) {
		tmp_num_threads = strtol(value, NULL, 0);

		/*
		 * We don't let the user specify less than one
		 * thread, but hope he's clueful enough not to
		 * specify 1000 threads.
		 */
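		/*
		 * Userland view of the option parsed here (illustrative;
		 * the backing path is a placeholder):
		 *
		 *	ctladm create -b block -o file=/dev/zvol/tank/vol0 \
		 *	    -o num_threads=8
		 *
		 * strtol(value, NULL, 0) accepts decimal, octal ("010")
		 * and hex ("0x8") spellings; values below 1 are rejected
		 * just below.
		 */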
 */
		if (tmp_num_threads < 1) {
			snprintf(req->error_str, sizeof(req->error_str),
				 "invalid number of threads %s", value);
			goto bailout_error;
		}
		num_threads = tmp_num_threads;
	}

	if (be_lun->vn == NULL)
		cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;

	/* Tell the user the blocksize we ended up using */
	params->lun_size_bytes = be_lun->size_bytes;
	params->blocksize_bytes = cbe_lun->blocksize;
	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
		cbe_lun->req_lun_id = params->req_lun_id;
		cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ;
	} else
		cbe_lun->req_lun_id = 0;

	cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown;
	cbe_lun->be = &ctl_be_block_driver;
	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%04d",
			 softc->num_luns);
		strncpy((char *)cbe_lun->serial_num, tmpstr,
			MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr)));

		/* Tell the user what we used for a serial number */
		strncpy((char *)params->serial_num, tmpstr,
			MIN(sizeof(params->serial_num), sizeof(tmpstr)));
	} else {
		strncpy((char *)cbe_lun->serial_num, params->serial_num,
			MIN(sizeof(cbe_lun->serial_num),
			    sizeof(params->serial_num)));
	}
	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%04d",
			 softc->num_luns);
		strncpy((char *)cbe_lun->device_id, tmpstr,
			MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr)));

		/* Tell the user what we used for a device ID */
		strncpy((char *)params->device_id, tmpstr,
			MIN(sizeof(params->device_id), sizeof(tmpstr)));
	} else {
		strncpy((char *)cbe_lun->device_id, params->device_id,
			MIN(sizeof(cbe_lun->device_id),
			    sizeof(params->device_id)));
	}

	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);

	be_lun->io_taskqueue = taskqueue_create("ctlblocktq", M_WAITOK,
	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
	if (be_lun->io_taskqueue == NULL) {
		snprintf(req->error_str, sizeof(req->error_str),
			 "unable to create taskqueue");
		goto bailout_error;
	}

	/*
	 * Note that we start the same number of threads by default for
	 * both the file case and the block device case. For the file
	 * case, we need multiple threads to allow concurrency, because the
	 * vnode interface is designed to be a blocking interface. For the
	 * block device case, ZFS zvols at least will block the caller's
	 * context in many instances, and so we need multiple threads to
	 * overcome that problem. Other block devices don't need as many
	 * threads, but they shouldn't cause too many problems.
	 *
	 * If the user wants to just have a single thread for a block
	 * device, he can specify that when the LUN is created, or change
	 * the tunable/sysctl to alter the default number of threads.
*/ retval = taskqueue_start_threads_in_proc(&be_lun->io_taskqueue, /*num threads*/num_threads, /*priority*/PUSER, /*proc*/control_softc->ctl_proc, /*thread name*/"block"); if (retval != 0) goto bailout_error; be_lun->num_threads = num_threads; retval = ctl_add_lun(&be_lun->cbe_lun); if (retval != 0) { snprintf(req->error_str, sizeof(req->error_str), "ctl_add_lun() returned error %d, see dmesg for " "details", retval); retval = 0; goto bailout_error; } be_lun->disk_stats = devstat_new_entry("cbb", cbe_lun->lun_id, cbe_lun->blocksize, DEVSTAT_ALL_SUPPORTED, cbe_lun->lun_type | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); mtx_lock(&softc->lock); softc->num_luns++; SLIST_INSERT_HEAD(&softc->lun_list, be_lun, links); mtx_unlock(&softc->lock); params->req_lun_id = cbe_lun->lun_id; return (retval); bailout_error: req->status = CTL_LUN_ERROR; if (be_lun->io_taskqueue != NULL) taskqueue_free(be_lun->io_taskqueue); ctl_be_block_close(be_lun); if (be_lun->dev_path != NULL) free(be_lun->dev_path, M_CTLBLK); nvlist_destroy(cbe_lun->options); mtx_destroy(&be_lun->queue_lock); mtx_destroy(&be_lun->io_lock); free(be_lun, M_CTLBLK); return (retval); } static int ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) { struct ctl_lun_rm_params *params; struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; int retval; params = &req->reqdata.rm; sx_xlock(&softc->modify_lock); mtx_lock(&softc->lock); SLIST_FOREACH(be_lun, &softc->lun_list, links) { if (be_lun->cbe_lun.lun_id == params->lun_id) { SLIST_REMOVE(&softc->lun_list, be_lun, ctl_be_block_lun, links); softc->num_luns--; break; } } mtx_unlock(&softc->lock); sx_xunlock(&softc->modify_lock); if (be_lun == NULL) { snprintf(req->error_str, sizeof(req->error_str), "LUN %u is not managed by the block backend", params->lun_id); goto bailout_error; } cbe_lun = &be_lun->cbe_lun; if (be_lun->vn != NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); taskqueue_drain_all(be_lun->io_taskqueue); ctl_be_block_close(be_lun); } mtx_lock(&softc->lock); be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING; mtx_unlock(&softc->lock); retval = ctl_remove_lun(cbe_lun); if (retval != 0) { snprintf(req->error_str, sizeof(req->error_str), "error %d returned from ctl_remove_lun() for " "LUN %d", retval, params->lun_id); mtx_lock(&softc->lock); be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING; mtx_unlock(&softc->lock); goto bailout_error; } mtx_lock(&softc->lock); while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) { retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblockrm", 0); if (retval == EINTR) break; } be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING; if (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) { mtx_unlock(&softc->lock); free(be_lun, M_CTLBLK); } else { mtx_unlock(&softc->lock); return (EINTR); } req->status = CTL_LUN_OK; return (0); bailout_error: req->status = CTL_LUN_ERROR; return (0); } static int ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req) { struct ctl_lun_modify_params *params; struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; const char *value; uint64_t oldsize; int error, wasprim; params = &req->reqdata.modify; sx_xlock(&softc->modify_lock); mtx_lock(&softc->lock); SLIST_FOREACH(be_lun, &softc->lun_list, links) { if (be_lun->cbe_lun.lun_id == params->lun_id) break; } mtx_unlock(&softc->lock); if (be_lun == NULL) { snprintf(req->error_str, sizeof(req->error_str), "LUN %u is not managed by the block backend", params->lun_id); goto bailout_error; } cbe_lun = &be_lun->cbe_lun; 
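	/*
	 * A sketch of the vn_isdisk() API split this revision applies,
	 * inferred from the call sites alone (treat the exact signatures
	 * as an assumption, not part of this change):
	 *
	 *	vn_isdisk(vp)			predicate only
	 *	vn_isdisk_error(vp, &error)	predicate, plus an errno
	 *					out-parameter on failure
	 *
	 * The reopen path below propagates the errno to its caller, so
	 * it moves to the _error variant; the Linux stat translation
	 * hunks further down only need the predicate and drop the
	 * argument instead.
	 */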
if (params->lun_size_bytes != 0) be_lun->params.lun_size_bytes = params->lun_size_bytes; if (req->args_nvl != NULL) { nvlist_destroy(cbe_lun->options); cbe_lun->options = nvlist_clone(req->args_nvl); } wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY); value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL); if (value != NULL) { if (strcmp(value, "primary") == 0) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; else cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY; } else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF) cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY; else cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY; if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) { if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ctl_lun_primary(cbe_lun); else ctl_lun_secondary(cbe_lun); } oldsize = be_lun->size_blocks; if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) || control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) { if (be_lun->vn == NULL) error = ctl_be_block_open(be_lun, req); - else if (vn_isdisk(be_lun->vn, &error)) + else if (vn_isdisk_error(be_lun->vn, &error)) error = ctl_be_block_open_dev(be_lun, req); else if (be_lun->vn->v_type == VREG) { vn_lock(be_lun->vn, LK_SHARED | LK_RETRY); error = ctl_be_block_open_file(be_lun, req); VOP_UNLOCK(be_lun->vn); } else error = EINVAL; if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) && be_lun->vn != NULL) { cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA; ctl_lun_has_media(cbe_lun); } else if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) == 0 && be_lun->vn == NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); } cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED; } else { if (be_lun->vn != NULL) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); taskqueue_drain_all(be_lun->io_taskqueue); error = ctl_be_block_close(be_lun); } else error = 0; } if (be_lun->size_blocks != oldsize) ctl_lun_capacity_changed(cbe_lun); /* Tell the user the exact size we ended up using */ params->lun_size_bytes = be_lun->size_bytes; sx_xunlock(&softc->modify_lock); req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK; return (0); bailout_error: sx_xunlock(&softc->modify_lock); req->status = CTL_LUN_ERROR; return (0); } static void ctl_be_block_lun_shutdown(struct ctl_be_lun *cbe_lun) { struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)cbe_lun; struct ctl_be_block_softc *softc = be_lun->softc; taskqueue_drain_all(be_lun->io_taskqueue); taskqueue_free(be_lun->io_taskqueue); if (be_lun->disk_stats != NULL) devstat_remove_entry(be_lun->disk_stats); nvlist_destroy(be_lun->cbe_lun.options); free(be_lun->dev_path, M_CTLBLK); mtx_destroy(&be_lun->queue_lock); mtx_destroy(&be_lun->io_lock); mtx_lock(&softc->lock); be_lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED; if (be_lun->flags & CTL_BE_BLOCK_LUN_WAITING) wakeup(be_lun); else free(be_lun, M_CTLBLK); mtx_unlock(&softc->lock); } static int ctl_be_block_config_write(union ctl_io *io) { struct ctl_be_block_lun *be_lun; struct ctl_be_lun *cbe_lun; int retval; DPRINTF("entered\n"); cbe_lun = CTL_BACKEND_LUN(io); be_lun = (struct ctl_be_block_lun *)cbe_lun; retval = 0; switch (io->scsiio.cdb[0]) { case SYNCHRONIZE_CACHE: case SYNCHRONIZE_CACHE_16: case WRITE_SAME_10: case WRITE_SAME_16: case UNMAP: /* * The upper level CTL code will filter out any CDBs with * the immediate bit set and return the proper error. * * We don't really need to worry about what LBA range the * user asked to be synced out. When they issue a sync * cache command, we'll sync out the whole thing. 
*/ mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); break; case START_STOP_UNIT: { struct scsi_start_stop_unit *cdb; struct ctl_lun_req req; cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb; if ((cdb->how & SSS_PC_MASK) != 0) { ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; } if (cdb->how & SSS_START) { if ((cdb->how & SSS_LOEJ) && be_lun->vn == NULL) { retval = ctl_be_block_open(be_lun, &req); cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED; if (retval == 0) { cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA; ctl_lun_has_media(cbe_lun); } else { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; ctl_lun_no_media(cbe_lun); } } ctl_start_lun(cbe_lun); } else { ctl_stop_lun(cbe_lun); if (cdb->how & SSS_LOEJ) { cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA; cbe_lun->flags |= CTL_LUN_FLAG_EJECTED; ctl_lun_ejected(cbe_lun); if (be_lun->vn != NULL) ctl_be_block_close(be_lun); } } ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; } case PREVENT_ALLOW: ctl_set_success(&io->scsiio); ctl_config_write_done(io); break; default: ctl_set_invalid_opcode(&io->scsiio); ctl_config_write_done(io); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } static int ctl_be_block_config_read(union ctl_io *io) { struct ctl_be_block_lun *be_lun; int retval = 0; DPRINTF("entered\n"); be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io); switch (io->scsiio.cdb[0]) { case SERVICE_ACTION_IN: if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) { mtx_lock(&be_lun->queue_lock); STAILQ_INSERT_TAIL(&be_lun->config_read_queue, &io->io_hdr, links); mtx_unlock(&be_lun->queue_lock); taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task); retval = CTL_RETVAL_QUEUED; break; } ctl_set_invalid_field(&io->scsiio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); ctl_config_read_done(io); retval = CTL_RETVAL_COMPLETE; break; default: ctl_set_invalid_opcode(&io->scsiio); ctl_config_read_done(io); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } static int ctl_be_block_lun_info(struct ctl_be_lun *cbe_lun, struct sbuf *sb) { struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)cbe_lun; int retval; retval = sbuf_printf(sb, "\t"); if (retval != 0) goto bailout; retval = sbuf_printf(sb, "%d", lun->num_threads); if (retval != 0) goto bailout; retval = sbuf_printf(sb, "\n"); bailout: return (retval); } static uint64_t ctl_be_block_lun_attr(struct ctl_be_lun *cbe_lun, const char *attrname) { struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)cbe_lun; if (lun->getattr == NULL) return (UINT64_MAX); return (lun->getattr(lun, attrname)); } static int ctl_be_block_init(void) { struct ctl_be_block_softc *softc = &backend_block_softc; sx_init(&softc->modify_lock, "ctlblock modify"); mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF); softc->beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); softc->buf_zone = uma_zcreate("ctlblock", CTLBLK_MAX_SEG, NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0); SLIST_INIT(&softc->lun_list); return (0); } static int ctl_be_block_shutdown(void) { struct ctl_be_block_softc *softc = &backend_block_softc; struct ctl_be_block_lun *lun; mtx_lock(&softc->lock); while ((lun = SLIST_FIRST(&softc->lun_list)) != NULL) { SLIST_REMOVE_HEAD(&softc->lun_list, links); softc->num_luns--; /* * Drop our lock here. 
Since ctl_remove_lun() can call * back into us, this could potentially lead to a recursive * lock of the same mutex, which would cause a hang. */ mtx_unlock(&softc->lock); ctl_remove_lun(&lun->cbe_lun); mtx_lock(&softc->lock); } mtx_unlock(&softc->lock); uma_zdestroy(softc->buf_zone); uma_zdestroy(softc->beio_zone); mtx_destroy(&softc->lock); sx_destroy(&softc->modify_lock); return (0); } Index: head/sys/compat/linux/linux_stats.c =================================================================== --- head/sys/compat/linux/linux_stats.c (revision 364371) +++ head/sys/compat/linux/linux_stats.c (revision 364372) @@ -1,729 +1,729 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include static void translate_vnhook_major_minor(struct vnode *vp, struct stat *sb) { int major, minor; - if (vn_isdisk(vp, NULL)) { + if (vn_isdisk(vp)) { sb->st_mode &= ~S_IFMT; sb->st_mode |= S_IFBLK; } /* * Return the same st_dev for every devfs instance. The reason * for this is to work around an idiosyncrasy of glibc getttynam() * implementation: it checks whether st_dev returned for fd 0 * is the same as st_dev returned for the target of /proc/self/fd/0 * symlink, and with linux chroots having their own devfs instance, * the check will fail if you chroot into it. 
*/ if (rootdevmp != NULL && vp->v_mount->mnt_vfc == rootdevmp->mnt_vfc) sb->st_dev = rootdevmp->mnt_stat.f_fsid.val[0]; if (vp->v_type == VCHR && vp->v_rdev != NULL && linux_driver_get_major_minor(devtoname(vp->v_rdev), &major, &minor) == 0) { sb->st_rdev = (major << 8 | minor); } } static int linux_kern_statat(struct thread *td, int flag, int fd, const char *path, enum uio_seg pathseg, struct stat *sbp) { return (kern_statat(td, flag, fd, path, pathseg, sbp, translate_vnhook_major_minor)); } #ifdef LINUX_LEGACY_SYSCALLS static int linux_kern_stat(struct thread *td, const char *path, enum uio_seg pathseg, struct stat *sbp) { return (linux_kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp)); } static int linux_kern_lstat(struct thread *td, const char *path, enum uio_seg pathseg, struct stat *sbp) { return (linux_kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg, sbp)); } #endif static void translate_fd_major_minor(struct thread *td, int fd, struct stat *buf) { struct file *fp; struct vnode *vp; struct mount *mp; int major, minor; /* * No capability rights required here. */ if ((!S_ISCHR(buf->st_mode) && !S_ISBLK(buf->st_mode)) || fget(td, fd, &cap_no_rights, &fp) != 0) return; vp = fp->f_vnode; - if (vp != NULL && vn_isdisk(vp, NULL)) { + if (vp != NULL && vn_isdisk(vp)) { buf->st_mode &= ~S_IFMT; buf->st_mode |= S_IFBLK; } if (vp != NULL && rootdevmp != NULL) { mp = vp->v_mount; __compiler_membar(); if (mp != NULL && mp->mnt_vfc == rootdevmp->mnt_vfc) buf->st_dev = rootdevmp->mnt_stat.f_fsid.val[0]; } if (vp != NULL && vp->v_rdev != NULL && linux_driver_get_major_minor(devtoname(vp->v_rdev), &major, &minor) == 0) { buf->st_rdev = (major << 8 | minor); } else if (fp->f_type == DTYPE_PTS) { struct tty *tp = fp->f_data; /* Convert the numbers for the slave device. */ if (linux_driver_get_major_minor(devtoname(tp->t_dev), &major, &minor) == 0) { buf->st_rdev = (major << 8 | minor); } } fdrop(fp, td); } /* * l_dev_t has the same encoding as dev_t in the latter's low 16 bits, so * truncation of a dev_t to 16 bits gives the same result as unpacking * using major() and minor() and repacking in the l_dev_t format. This * detail is hidden in dev_to_ldev(). Overflow in conversions of dev_t's * are not checked for, as for other fields. * * dev_to_ldev() is only used for translating st_dev. When we convert * st_rdev for copying it out, it isn't really a dev_t, but has already * been translated to an l_dev_t in a nontrivial way. Translating it * again would be illogical but would have no effect since the low 16 * bits have the same encoding. * * The nontrivial translation for st_rdev renumbers some devices, but not * ones that can be mounted on, so it is consistent with the translation * for st_dev except when the renumbering or truncation causes conflicts. 
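 * A short worked example of this encoding follows the stat copyout
 * routines below.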
*/ #define dev_to_ldev(d) ((uint16_t)(d)) static int newstat_copyout(struct stat *buf, void *ubuf) { struct l_newstat tbuf; bzero(&tbuf, sizeof(tbuf)); tbuf.st_dev = dev_to_ldev(buf->st_dev); tbuf.st_ino = buf->st_ino; tbuf.st_mode = buf->st_mode; tbuf.st_nlink = buf->st_nlink; tbuf.st_uid = buf->st_uid; tbuf.st_gid = buf->st_gid; tbuf.st_rdev = buf->st_rdev; tbuf.st_size = buf->st_size; tbuf.st_atim.tv_sec = buf->st_atim.tv_sec; tbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; tbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; tbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; tbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; tbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; tbuf.st_blksize = buf->st_blksize; tbuf.st_blocks = buf->st_blocks; return (copyout(&tbuf, ubuf, sizeof(tbuf))); } #ifdef LINUX_LEGACY_SYSCALLS int linux_newstat(struct thread *td, struct linux_newstat_args *args) { struct stat buf; char *path; int error; if (!LUSECONVPATH(td)) { error = linux_kern_stat(td, args->path, UIO_USERSPACE, &buf); } else { LCONVPATHEXIST(td, args->path, &path); error = linux_kern_stat(td, path, UIO_SYSSPACE, &buf); LFREEPATH(path); } if (error) return (error); return (newstat_copyout(&buf, args->buf)); } int linux_newlstat(struct thread *td, struct linux_newlstat_args *args) { struct stat sb; char *path; int error; if (!LUSECONVPATH(td)) { error = linux_kern_lstat(td, args->path, UIO_USERSPACE, &sb); } else { LCONVPATHEXIST(td, args->path, &path); error = linux_kern_lstat(td, path, UIO_SYSSPACE, &sb); LFREEPATH(path); } if (error) return (error); return (newstat_copyout(&sb, args->buf)); } #endif int linux_newfstat(struct thread *td, struct linux_newfstat_args *args) { struct stat buf; int error; error = kern_fstat(td, args->fd, &buf); translate_fd_major_minor(td, args->fd, &buf); if (!error) error = newstat_copyout(&buf, args->buf); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int stat_copyout(struct stat *buf, void *ubuf) { struct l_stat lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = dev_to_ldev(buf->st_dev); lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = buf->st_rdev; lbuf.st_size = MIN(buf->st_size, INT32_MAX); lbuf.st_atim.tv_sec = buf->st_atim.tv_sec; lbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; lbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; lbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; lbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; lbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; lbuf.st_flags = buf->st_flags; lbuf.st_gen = buf->st_gen; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat(struct thread *td, struct linux_stat_args *args) { struct stat buf; char *path; int error; if (!LUSECONVPATH(td)) { error = linux_kern_stat(td, args->path, UIO_USERSPACE, &buf); } else { LCONVPATHEXIST(td, args->path, &path); error = linux_kern_stat(td, path, UIO_SYSSPACE, &buf); LFREEPATH(path); } if (error) { return (error); } return (stat_copyout(&buf, args->up)); } int linux_lstat(struct thread *td, struct linux_lstat_args *args) { struct stat buf; char *path; int error; if (!LUSECONVPATH(td)) { error = linux_kern_lstat(td, args->path, UIO_USERSPACE, &buf); } else { LCONVPATHEXIST(td, args->path, &path); error = linux_kern_lstat(td, path, UIO_SYSSPACE, &buf); LFREEPATH(path); } if (error) { return (error); } return (stat_copyout(&buf, args->up)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ 
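/*
 * A short worked example of the dev_to_ldev() encoding argument above.
 * This is an illustrative sketch only, not part of the change: the
 * device numbers are made up, and the function exists purely to show
 * that truncation and explicit major/minor repacking agree in the low
 * 16 bits of FreeBSD's historical dev_t layout (major in bits 8..15,
 * low byte of the minor in bits 0..7).
 */
static void
dev_to_ldev_example(void)
{
	dev_t d = makedev(5, 3);	/* hypothetical: major 5, minor 3 */
	uint16_t truncated = dev_to_ldev(d);		/* 0x0503 */
	uint16_t repacked = (major(d) << 8) | minor(d);	/* 0x0503 */

	KASSERT(truncated == repacked, ("l_dev_t encoding mismatch"));
}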
struct l_statfs { l_long f_type; l_long f_bsize; l_long f_blocks; l_long f_bfree; l_long f_bavail; l_long f_files; l_long f_ffree; l_fsid_t f_fsid; l_long f_namelen; l_long f_frsize; l_long f_flags; l_long f_spare[4]; }; #define LINUX_CODA_SUPER_MAGIC 0x73757245L #define LINUX_EXT2_SUPER_MAGIC 0xEF53L #define LINUX_HPFS_SUPER_MAGIC 0xf995e849L #define LINUX_ISOFS_SUPER_MAGIC 0x9660L #define LINUX_MSDOS_SUPER_MAGIC 0x4d44L #define LINUX_NCP_SUPER_MAGIC 0x564cL #define LINUX_NFS_SUPER_MAGIC 0x6969L #define LINUX_NTFS_SUPER_MAGIC 0x5346544EL #define LINUX_PROC_SUPER_MAGIC 0x9fa0L #define LINUX_UFS_SUPER_MAGIC 0x00011954L /* XXX - UFS_MAGIC in Linux */ #define LINUX_ZFS_SUPER_MAGIC 0x2FC12FC1 #define LINUX_DEVFS_SUPER_MAGIC 0x1373L #define LINUX_SHMFS_MAGIC 0x01021994 static long bsd_to_linux_ftype(const char *fstypename) { int i; static struct {const char *bsd_name; long linux_type;} b2l_tbl[] = { {"ufs", LINUX_UFS_SUPER_MAGIC}, {"zfs", LINUX_ZFS_SUPER_MAGIC}, {"cd9660", LINUX_ISOFS_SUPER_MAGIC}, {"nfs", LINUX_NFS_SUPER_MAGIC}, {"ext2fs", LINUX_EXT2_SUPER_MAGIC}, {"procfs", LINUX_PROC_SUPER_MAGIC}, {"msdosfs", LINUX_MSDOS_SUPER_MAGIC}, {"ntfs", LINUX_NTFS_SUPER_MAGIC}, {"nwfs", LINUX_NCP_SUPER_MAGIC}, {"hpfs", LINUX_HPFS_SUPER_MAGIC}, {"coda", LINUX_CODA_SUPER_MAGIC}, {"devfs", LINUX_DEVFS_SUPER_MAGIC}, {"tmpfs", LINUX_SHMFS_MAGIC}, {NULL, 0L}}; for (i = 0; b2l_tbl[i].bsd_name != NULL; i++) if (strcmp(b2l_tbl[i].bsd_name, fstypename) == 0) return (b2l_tbl[i].linux_type); return (0L); } static int bsd_to_linux_statfs(struct statfs *bsd_statfs, struct l_statfs *linux_statfs) { #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) uint64_t tmp; #define LINUX_HIBITS 0xffffffff00000000ULL tmp = bsd_statfs->f_blocks | bsd_statfs->f_bfree | bsd_statfs->f_files | bsd_statfs->f_bsize; if ((bsd_statfs->f_bavail != -1 && (bsd_statfs->f_bavail & LINUX_HIBITS)) || (bsd_statfs->f_ffree != -1 && (bsd_statfs->f_ffree & LINUX_HIBITS)) || (tmp & LINUX_HIBITS)) return (EOVERFLOW); #undef LINUX_HIBITS #endif linux_statfs->f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs->f_bsize = bsd_statfs->f_bsize; linux_statfs->f_blocks = bsd_statfs->f_blocks; linux_statfs->f_bfree = bsd_statfs->f_bfree; linux_statfs->f_bavail = bsd_statfs->f_bavail; linux_statfs->f_ffree = bsd_statfs->f_ffree; linux_statfs->f_files = bsd_statfs->f_files; linux_statfs->f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs->f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs->f_namelen = MAXNAMLEN; linux_statfs->f_frsize = bsd_statfs->f_bsize; linux_statfs->f_flags = 0; memset(linux_statfs->f_spare, 0, sizeof(linux_statfs->f_spare)); return (0); } int linux_statfs(struct thread *td, struct linux_statfs_args *args) { struct l_statfs linux_statfs; struct statfs *bsd_statfs; char *path; int error; if (!LUSECONVPATH(td)) { bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, args->path, UIO_USERSPACE, bsd_statfs); } else { LCONVPATHEXIST(td, args->path, &path); bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, path, UIO_SYSSPACE, bsd_statfs); LFREEPATH(path); } if (error == 0) error = bsd_to_linux_statfs(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static void bsd_to_linux_statfs64(struct statfs *bsd_statfs, struct l_statfs64 *linux_statfs) { 
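	/*
	 * Unlike bsd_to_linux_statfs() above, no EOVERFLOW pre-check is
	 * needed here: the l_statfs64 counters are 64 bits wide, so the
	 * LINUX_HIBITS truncation concern does not apply.
	 */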
linux_statfs->f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs->f_bsize = bsd_statfs->f_bsize; linux_statfs->f_blocks = bsd_statfs->f_blocks; linux_statfs->f_bfree = bsd_statfs->f_bfree; linux_statfs->f_bavail = bsd_statfs->f_bavail; linux_statfs->f_ffree = bsd_statfs->f_ffree; linux_statfs->f_files = bsd_statfs->f_files; linux_statfs->f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs->f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs->f_namelen = MAXNAMLEN; linux_statfs->f_frsize = bsd_statfs->f_bsize; linux_statfs->f_flags = 0; memset(linux_statfs->f_spare, 0, sizeof(linux_statfs->f_spare)); } int linux_statfs64(struct thread *td, struct linux_statfs64_args *args) { struct l_statfs64 linux_statfs; struct statfs *bsd_statfs; char *path; int error; if (args->bufsize != sizeof(struct l_statfs64)) return (EINVAL); if (!LUSECONVPATH(td)) { bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, args->path, UIO_USERSPACE, bsd_statfs); } else { LCONVPATHEXIST(td, args->path, &path); bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, path, UIO_SYSSPACE, bsd_statfs); LFREEPATH(path); } if (error == 0) bsd_to_linux_statfs64(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } int linux_fstatfs64(struct thread *td, struct linux_fstatfs64_args *args) { struct l_statfs64 linux_statfs; struct statfs *bsd_statfs; int error; if (args->bufsize != sizeof(struct l_statfs64)) return (EINVAL); bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, args->fd, bsd_statfs); if (error == 0) bsd_to_linux_statfs64(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_fstatfs(struct thread *td, struct linux_fstatfs_args *args) { struct l_statfs linux_statfs; struct statfs *bsd_statfs; int error; bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, args->fd, bsd_statfs); if (error == 0) error = bsd_to_linux_statfs(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } struct l_ustat { l_daddr_t f_tfree; l_ino_t f_tinode; char f_fname[6]; char f_fpack[6]; }; #ifdef LINUX_LEGACY_SYSCALLS int linux_ustat(struct thread *td, struct linux_ustat_args *args) { return (EOPNOTSUPP); } #endif #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int stat64_copyout(struct stat *buf, void *ubuf) { struct l_stat64 lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = dev_to_ldev(buf->st_dev); lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = buf->st_rdev; lbuf.st_size = buf->st_size; lbuf.st_atim.tv_sec = buf->st_atim.tv_sec; lbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; lbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; lbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; lbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; lbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; /* * The __st_ino field makes all the difference. 
In the Linux kernel * it is conditionally compiled based on STAT64_HAS_BROKEN_ST_INO, * but without the assignment to __st_ino the runtime linker refuses * to mmap(2) any shared libraries. I guess it's broken alright :-) */ lbuf.__st_ino = buf->st_ino; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat64(struct thread *td, struct linux_stat64_args *args) { struct stat buf; char *filename; int error; if (!LUSECONVPATH(td)) { error = linux_kern_stat(td, args->filename, UIO_USERSPACE, &buf); } else { LCONVPATHEXIST(td, args->filename, &filename); error = linux_kern_stat(td, filename, UIO_SYSSPACE, &buf); LFREEPATH(filename); } if (error) return (error); return (stat64_copyout(&buf, args->statbuf)); } int linux_lstat64(struct thread *td, struct linux_lstat64_args *args) { struct stat sb; char *filename; int error; if (!LUSECONVPATH(td)) { error = linux_kern_lstat(td, args->filename, UIO_USERSPACE, &sb); } else { LCONVPATHEXIST(td, args->filename, &filename); error = linux_kern_lstat(td, filename, UIO_SYSSPACE, &sb); LFREEPATH(filename); } if (error) return (error); return (stat64_copyout(&sb, args->statbuf)); } int linux_fstat64(struct thread *td, struct linux_fstat64_args *args) { struct stat buf; int error; error = kern_fstat(td, args->fd, &buf); translate_fd_major_minor(td, args->fd, &buf); if (!error) error = stat64_copyout(&buf, args->statbuf); return (error); } int linux_fstatat64(struct thread *td, struct linux_fstatat64_args *args) { char *path; int error, dfd, flag; struct stat buf; if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; if (!LUSECONVPATH(td)) { error = linux_kern_statat(td, flag, dfd, args->pathname, UIO_USERSPACE, &buf); } else { LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); error = linux_kern_statat(td, flag, dfd, path, UIO_SYSSPACE, &buf); LFREEPATH(path); } if (error == 0) error = stat64_copyout(&buf, args->statbuf); return (error); } #else /* __amd64__ && !COMPAT_LINUX32 */ int linux_newfstatat(struct thread *td, struct linux_newfstatat_args *args) { char *path; int error, dfd, flag; struct stat buf; if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; if (!LUSECONVPATH(td)) { error = linux_kern_statat(td, flag, dfd, args->pathname, UIO_USERSPACE, &buf); } else { LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); error = linux_kern_statat(td, flag, dfd, path, UIO_SYSSPACE, &buf); LFREEPATH(path); } if (error == 0) error = newstat_copyout(&buf, args->statbuf); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_syncfs(struct thread *td, struct linux_syncfs_args *args) { struct mount *mp; struct vnode *vp; int error, save; error = fgetvp(td, args->fd, &cap_fsync_rights, &vp); if (error != 0) /* * Linux syncfs() returns only EBADF, however fgetvp() * can return EINVAL in case of file descriptor does * not represent a vnode. XXX. */ return (error); mp = vp->v_mount; mtx_lock(&mountlist_mtx); error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error != 0) { /* See comment above. 
*/ mtx_unlock(&mountlist_mtx); goto out; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_periodic(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } vfs_unbusy(mp); out: vrele(vp); return (error); } Index: head/sys/dev/xen/blkback/blkback.c =================================================================== --- head/sys/dev/xen/blkback/blkback.c (revision 364371) +++ head/sys/dev/xen/blkback/blkback.c (revision 364372) @@ -1,3944 +1,3944 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2012 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) * Ken Merry (Spectra Logic Corporation) */ #include __FBSDID("$FreeBSD$"); /** * \file blkback.c * * \brief Device driver supporting the vending of block storage from * a FreeBSD domain to other domains. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*--------------------------- Compile-time Tunables --------------------------*/ /** * The maximum number of shared memory ring pages we will allow in a * negotiated block-front/back communication channel. Allow enough * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. */ #define XBB_MAX_RING_PAGES 32 /** * The maximum number of outstanding request blocks (request headers plus * additional segment blocks) we will allow in a negotiated block-front/back * communication channel. */ #define XBB_MAX_REQUESTS \ __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) /** * \brief Define to force all I/O to be performed on memory owned by the * backend device, with a copy-in/out to the remote domain's memory. * * \note This option is currently required when this driver's domain is * operating in HVM mode on a system using an IOMMU. 
* * This driver uses Xen's grant table API to gain access to the memory of * the remote domains it serves. When our domain is operating in PV mode, * the grant table mechanism directly updates our domain's page table entries * to point to the physical pages of the remote domain. This scheme guarantees * that blkback and the backing devices it uses can safely perform DMA * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to * insure that our domain cannot DMA to pages owned by another domain. As * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant * table API. For this reason, in HVM mode, we must bounce all requests into * memory that is mapped into our domain at domain startup and thus has * valid IOMMU mappings. */ #define XBB_USE_BOUNCE_BUFFERS /** * \brief Define to enable rudimentary request logging to the console. */ #undef XBB_DEBUG /*---------------------------------- Macros ----------------------------------*/ /** * Custom malloc type for all driver allocations. */ static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); #ifdef XBB_DEBUG #define DPRINTF(fmt, args...) \ printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while(0) #endif /** * The maximum mapped region size per request we will allow in a negotiated * block-front/back communication channel. */ #define XBB_MAX_REQUEST_SIZE \ MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) /** * The maximum number of segments (within a request header and accompanying * segment blocks) per request we will allow in a negotiated block-front/back * communication channel. */ #define XBB_MAX_SEGMENTS_PER_REQUEST \ (MIN(UIO_MAXIOV, \ MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) /** * The maximum number of ring pages that we can allow per request list. * We limit this to the maximum number of segments per request, because * that is already a reasonable number of segments to aggregate. This * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, * because that would leave situations where we can't dispatch even one * large request. */ #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST /*--------------------------- Forward Declarations ---------------------------*/ struct xbb_softc; struct xbb_xen_req; static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) __attribute__((format(printf, 3, 4))); static int xbb_shutdown(struct xbb_softc *xbb); /*------------------------------ Data Structures -----------------------------*/ STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); typedef enum { XBB_REQLIST_NONE = 0x00, XBB_REQLIST_MAPPED = 0x01 } xbb_reqlist_flags; struct xbb_xen_reqlist { /** * Back reference to the parent block back instance for this * request. Used during bio_done handling. */ struct xbb_softc *xbb; /** * BLKIF_OP code for this request. */ int operation; /** * Set to BLKIF_RSP_* to indicate request status. * * This field allows an error status to be recorded even if the * delivery of this status must be deferred. Deferred reporting * is necessary, for example, when an error is detected during * completion processing of one bio when other bios for this * request are still outstanding. */ int status; /** * Number of 512 byte sectors not transferred. */ int residual_512b_sectors; /** * Starting sector number of the first request in the list. 
 */
	off_t		 starting_sector_number;

	/**
	 * If we're going to coalesce, the next contiguous sector would be
	 * this one.
	 */
	off_t		 next_contig_sector;

	/**
	 * Number of child requests in the list.
	 */
	int		 num_children;

	/**
	 * Number of I/O requests still pending on the backend.
	 */
	int		 pendcnt;

	/**
	 * Total number of segments for requests in the list.
	 */
	int		 nr_segments;

	/**
	 * Flags for this particular request list.
	 */
	xbb_reqlist_flags flags;

	/**
	 * Kernel virtual address space reserved for this request
	 * list structure and used to map the remote domain's pages for
	 * this I/O, into our domain's address space.
	 */
	uint8_t		*kva;

	/**
	 * Base, pseudo-physical address, corresponding to the start
	 * of this request's kva region.
	 */
	uint64_t	 gnt_base;

#ifdef XBB_USE_BOUNCE_BUFFERS
	/**
	 * Pre-allocated domain local memory used to proxy remote
	 * domain memory during I/O operations.
	 */
	uint8_t		*bounce;
#endif

	/**
	 * Array of grant handles (one per page) used to map this request.
	 */
	grant_handle_t	*gnt_handles;

	/**
	 * Device statistics request ordering type (ordered or simple).
	 */
	devstat_tag_type ds_tag_type;

	/**
	 * Device statistics request type (read, write, no_data).
	 */
	devstat_trans_flags ds_trans_type;

	/**
	 * The start time for this request.
	 */
	struct bintime	 ds_t0;

	/**
	 * Linked list of contiguous requests with the same operation type.
	 */
	struct xbb_xen_req_list contig_req_list;

	/**
	 * Linked list links used to aggregate idle requests in the
	 * request list free pool (xbb->reqlist_free_stailq) and pending
	 * requests waiting for execution (xbb->reqlist_pending_stailq).
	 */
	STAILQ_ENTRY(xbb_xen_reqlist) links;
};

STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);

/**
 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
 */
struct xbb_xen_req {
	/**
	 * Linked list links used to aggregate requests into a reqlist
	 * and to store them in the request free pool.
	 */
	STAILQ_ENTRY(xbb_xen_req) links;

	/**
	 * The remote domain's identifier for this I/O request.
	 */
	uint64_t	 id;

	/**
	 * The number of pages currently mapped for this request.
	 */
	int		 nr_pages;

	/**
	 * The number of 512 byte sectors comprising this request.
	 */
	int		 nr_512b_sectors;

	/**
	 * BLKIF_OP code for this request.
	 */
	int		 operation;

	/**
	 * Storage used for non-native ring requests.
	 */
	blkif_request_t	 ring_req_storage;

	/**
	 * Pointer to the Xen request in the ring.
	 */
	blkif_request_t	*ring_req;

	/**
	 * Consumer index for this request.
	 */
	RING_IDX	 req_ring_idx;

	/**
	 * The start time for this request.
	 */
	struct bintime	 ds_t0;

	/**
	 * Pointer back to our parent request list.
	 */
	struct xbb_xen_reqlist *reqlist;
};
SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
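/*
 * The structures above count transfers in blkif "sectors", which are
 * fixed at 512 bytes regardless of the backend's native sector size
 * (kept as a log2 shift in struct xbb_softc below).  The following is
 * an illustrative sketch of the conversion those fields imply; the
 * helper is hypothetical and not part of this change.
 */
static inline uint64_t
xbb_512b_to_native(uint64_t nr_512b_sectors, u_int sector_size_shift)
{
	/* blkif sectors -> bytes (<< 9), then bytes -> native sectors. */
	return ((nr_512b_sectors << 9) >> sector_size_shift);
}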
/**
 * \brief Configuration data for the shared memory request ring
 *        used to communicate with the front-end client of this driver.
 */
struct xbb_ring_config {
	/** KVA address where ring memory is mapped. */
	vm_offset_t	va;

	/** The pseudo-physical address where ring memory is mapped.*/
	uint64_t	gnt_addr;

	/**
	 * Grant table handles, one per-ring page, returned by the
	 * hypervisor upon mapping of the ring and required to
	 * unmap it when a connection is torn down.
	 */
	grant_handle_t	handle[XBB_MAX_RING_PAGES];

	/**
	 * The device bus address returned by the hypervisor when
	 * mapping the ring and required to unmap it when a connection
	 * is torn down.
	 */
	uint64_t	bus_addr[XBB_MAX_RING_PAGES];

	/** The number of ring pages mapped for the current connection. */
	u_int		ring_pages;

	/**
	 * The grant references, one per-ring page, supplied by the
	 * front-end, allowing us to reference the ring pages in the
	 * front-end's domain and to map these pages into our own domain.
	 */
	grant_ref_t	ring_ref[XBB_MAX_RING_PAGES];

	/** The interrupt driven event channel used to signal ring events. */
	evtchn_port_t	evtchn;
};

/**
 * Per-instance connection state flags.
 */
typedef enum
{
	/**
	 * The front-end requested a read-only mount of the
	 * back-end device/file.
	 */
	XBBF_READ_ONLY         = 0x01,

	/** Communication with the front-end has been established. */
	XBBF_RING_CONNECTED    = 0x02,

	/**
	 * Front-end requests exist in the ring and are waiting for
	 * xbb_xen_req objects to free up.
	 */
	XBBF_RESOURCE_SHORTAGE = 0x04,

	/** Connection teardown in progress. */
	XBBF_SHUTDOWN          = 0x08,

	/** A thread is already performing shutdown processing. */
	XBBF_IN_SHUTDOWN       = 0x10
} xbb_flag_t;

/** Backend device type. */
typedef enum {
	/** Backend type unknown. */
	XBB_TYPE_NONE		= 0x00,

	/**
	 * Backend type disk (access via cdev switch
	 * strategy routine).
	 */
	XBB_TYPE_DISK		= 0x01,

	/** Backend type file (access via vnode operations). */
	XBB_TYPE_FILE		= 0x02
} xbb_type;

/**
 * \brief Structure used to memoize information about a per-request
 *        scatter-gather list.
 *
 * The chief benefit of using this data structure is it avoids having
 * to reparse the possibly discontiguous S/G list in the original
 * request.  Due to the way that the mapping of the memory backing an
 * I/O transaction is handled by Xen, a second pass is unavoidable.
 * At least this way the second walk is a simple array traversal.
 *
 * \note A single Scatter/Gather element in the block interface covers
 *       at most 1 machine page.  In this context a sector (blkif
 *       nomenclature, not what I'd choose) is a 512b aligned unit
 *       of mapping within the machine page referenced by an S/G
 *       element.
 */
struct xbb_sg {
	/** The number of 512b data chunks mapped in this S/G element. */
	int16_t nsect;

	/**
	 * The index (0 based) of the first 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t first_sect;

	/**
	 * The index (0 based) of the last 512b data chunk mapped
	 * in this S/G element.
	 */
	uint8_t last_sect;
};

/**
 * Character device backend specific configuration data.
 */
struct xbb_dev_data {
	/** Cdev used for device backend access. */
	struct cdev   *cdev;

	/** Cdev switch used for device backend access. */
	struct cdevsw *csw;

	/** Used to hold a reference on opened cdev backend devices. */
	int	       dev_ref;
};

/**
 * File backend specific configuration data.
 */
struct xbb_file_data {
	/** Credentials to use for vnode backed (file based) I/O. */
	struct ucred   *cred;

	/**
	 * \brief Array of io vectors used to process file based I/O.
	 *
	 * Only a single file based request is outstanding per-xbb instance,
	 * so we only need one of these.
	 */
	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
#ifdef XBB_USE_BOUNCE_BUFFERS

	/**
	 * \brief Array of io vectors used to handle bouncing of file reads.
	 *
	 * Vnode operations are free to modify uio data during their
	 * execution.  In the case of a read with bounce buffering active,
	 * we need some of the data from the original uio in order to
	 * bounce-out the read data.  This array serves as the temporary
	 * storage for this saved data.
	 */
	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];

	/**
	 * \brief Array of memoized bounce buffer kva offsets used
	 *        in the file based backend.
	 *
	 * Due to the way that the mapping of the memory backing an
	 * I/O transaction is handled by Xen, a second pass through
	 * the request sg elements is unavoidable.
We memoize the computed * bounce address here to reduce the cost of the second walk. */ void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; #endif /* XBB_USE_BOUNCE_BUFFERS */ }; /** * Collection of backend type specific data. */ union xbb_backend_data { struct xbb_dev_data dev; struct xbb_file_data file; }; /** * Function signature of backend specific I/O handlers. */ typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int flags); /** * Per-instance configuration data. */ struct xbb_softc { /** * Task-queue used to process I/O requests. */ struct taskqueue *io_taskqueue; /** * Single "run the request queue" task enqueued * on io_taskqueue. */ struct task io_task; /** Device type for this instance. */ xbb_type device_type; /** NewBus device corresponding to this instance. */ device_t dev; /** Backend specific dispatch routine for this instance. */ xbb_dispatch_t dispatch_io; /** The number of requests outstanding on the backend device/file. */ int active_request_count; /** Free pool of request tracking structures. */ struct xbb_xen_req_list request_free_stailq; /** Array, sized at connection time, of request tracking structures. */ struct xbb_xen_req *requests; /** Free pool of request list structures. */ struct xbb_xen_reqlist_list reqlist_free_stailq; /** List of pending request lists awaiting execution. */ struct xbb_xen_reqlist_list reqlist_pending_stailq; /** Array, sized at connection time, of request list structures. */ struct xbb_xen_reqlist *request_lists; /** * Global pool of kva used for mapping remote domain ring * and I/O transaction data. */ vm_offset_t kva; /** Pseudo-physical address corresponding to kva. */ uint64_t gnt_base_addr; /** The size of the global kva pool. */ int kva_size; /** The size of the KVA area used for request lists. */ int reqlist_kva_size; /** The number of pages of KVA used for request lists */ int reqlist_kva_pages; /** Bitmap of free KVA pages */ bitstr_t *kva_free; /** * \brief Cached value of the front-end's domain id. * * This value is used at once for each mapped page in * a transaction. We cache it to avoid incuring the * cost of an ivar access every time this is needed. */ domid_t otherend_id; /** * \brief The blkif protocol abi in effect. * * There are situations where the back and front ends can * have a different, native abi (e.g. intel x86_64 and * 32bit x86 domains on the same machine). The back-end * always accommodates the front-end's native abi. That * value is pulled from the XenStore and recorded here. */ int abi; /** * \brief The maximum number of requests and request lists allowed * to be in flight at a time. * * This value is negotiated via the XenStore. */ u_int max_requests; /** * \brief The maximum number of segments (1 page per segment) * that can be mapped by a request. * * This value is negotiated via the XenStore. */ u_int max_request_segments; /** * \brief Maximum number of segments per request list. * * This value is derived from and will generally be larger than * max_request_segments. */ u_int max_reqlist_segments; /** * The maximum size of any request to this back-end * device. * * This value is negotiated via the XenStore. */ u_int max_request_size; /** * The maximum size of any request list. This is derived directly * from max_reqlist_segments. */ u_int max_reqlist_size; /** Various configuration and state bit flags. */ xbb_flag_t flags; /** Ring mapping and interrupt configuration data. 
*/ struct xbb_ring_config ring_config; /** Runtime, cross-abi safe, structures for ring access. */ blkif_back_rings_t rings; /** IRQ mapping for the communication ring event channel. */ xen_intr_handle_t xen_intr_handle; /** * \brief Backend access mode flags (e.g. write, or read-only). * * This value is passed to us by the front-end via the XenStore. */ char *dev_mode; /** * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). * * This value is passed to us by the front-end via the XenStore. * Currently unused. */ char *dev_type; /** * \brief Backend device/file identifier. * * This value is passed to us by the front-end via the XenStore. * We expect this to be a POSIX path indicating the file or * device to open. */ char *dev_name; /** * Vnode corresponding to the backend device node or file * we are acessing. */ struct vnode *vn; union xbb_backend_data backend; /** The native sector size of the backend. */ u_int sector_size; /** log2 of sector_size. */ u_int sector_size_shift; /** Size in bytes of the backend device or file. */ off_t media_size; /** * \brief media_size expressed in terms of the backend native * sector size. * * (e.g. xbb->media_size >> xbb->sector_size_shift). */ uint64_t media_num_sectors; /** * \brief Array of memoized scatter gather data computed during the * conversion of blkif ring requests to internal xbb_xen_req * structures. * * Ring processing is serialized so we only need one of these. */ struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; /** * Temporary grant table map used in xbb_dispatch_io(). When * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the * stack could cause a stack overflow. */ struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; /** Mutex protecting per-instance data. */ struct mtx lock; /** * Resource representing allocated physical address space * associated with our per-instance kva region. */ struct resource *pseudo_phys_res; /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; /** * I/O statistics from BlockBack dispatch down. These are * coalesced requests, and we start them right before execution. */ struct devstat *xbb_stats; /** * I/O statistics coming into BlockBack. These are the requests as * we get them from BlockFront. They are started as soon as we * receive a request, and completed when the I/O is complete. */ struct devstat *xbb_stats_in; /** Disable sending flush to the backend */ int disable_flush; /** Send a real flush for every N flush requests */ int flush_interval; /** Count of flush requests in the interval */ int flush_count; /** Don't coalesce requests if this is set */ int no_coalesce_reqs; /** Number of requests we have received */ uint64_t reqs_received; /** Number of requests we have completed*/ uint64_t reqs_completed; /** Number of requests we queued but not pushed*/ uint64_t reqs_queued_for_completion; /** Number of requests we completed with an error status*/ uint64_t reqs_completed_with_error; /** How many forced dispatches (i.e. without coalescing) have happened */ uint64_t forced_dispatch; /** How many normal dispatches have happened */ uint64_t normal_dispatch; /** How many total dispatches have happened */ uint64_t total_dispatch; /** How many times we have run out of KVA */ uint64_t kva_shortages; /** How many times we have run out of request structures */ uint64_t request_shortages; /** Watch to wait for hotplug script execution */ struct xs_watch hotplug_watch; /** Got the needed data from hotplug scripts? 
*/ bool hotplug_done; }; /*---------------------------- Request Processing ----------------------------*/ /** * Allocate an internal transaction tracking structure from the free pool. * * \param xbb Per-instance xbb configuration structure. * * \return On success, a pointer to the allocated xbb_xen_req structure. * Otherwise NULL. */ static inline struct xbb_xen_req * xbb_get_req(struct xbb_softc *xbb) { struct xbb_xen_req *req; req = NULL; mtx_assert(&xbb->lock, MA_OWNED); if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); xbb->active_request_count++; } return (req); } /** * Return an allocated transaction tracking structure to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req The request structure to free. */ static inline void xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) { mtx_assert(&xbb->lock, MA_OWNED); STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); xbb->active_request_count--; KASSERT(xbb->active_request_count >= 0, ("xbb_release_req: negative active count")); } /** * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req_list The list of requests to free. * \param nreqs The number of items in the list. */ static inline void xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, int nreqs) { mtx_assert(&xbb->lock, MA_OWNED); STAILQ_CONCAT(&xbb->request_free_stailq, req_list); xbb->active_request_count -= nreqs; KASSERT(xbb->active_request_count >= 0, ("xbb_release_reqs: negative active count")); } /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's kva region. * * \param reqlist The request structure whose kva region will be accessed. * \param pagenr The page index used to compute the kva offset. * \param sector The 512b sector index used to compute the page relative * kva offset. * * \return The computed global KVA offset. */ static inline uint8_t * xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); } #ifdef XBB_USE_BOUNCE_BUFFERS /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's local bounce memory region. * * \param reqlist The request structure whose bounce region will be accessed. * \param pagenr The page index used to compute the bounce offset. * \param sector The 512b sector index used to compute the page relative * bounce offset. * * \return The computed global bounce buffer address. */ static inline uint8_t * xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); } #endif /** * Given a page number and 512b sector offset within that page, * calculate an offset into the request's memory region that the * underlying backend device/file should use for I/O. * * \param reqlist The request structure whose I/O region will be accessed. * \param pagenr The page index used to compute the I/O offset. * \param sector The 512b sector index used to compute the page relative * I/O offset. * * \return The computed global I/O address. * * Depending on configuration, this will either be a local bounce buffer * or a pointer to the memory mapped in from the front-end domain for * this request. 
*/ static inline uint8_t * xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { #ifdef XBB_USE_BOUNCE_BUFFERS return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); #else return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); #endif } /** * Given a page index and 512b sector offset within that page, calculate * an offset into the local pseudo-physical address space used to map a * front-end's request data into a request. * * \param reqlist The request list structure whose pseudo-physical region * will be accessed. * \param pagenr The page index used to compute the pseudo-physical offset. * \param sector The 512b sector index used to compute the page relative * pseudo-physical offset. * * \return The computed global pseudo-physical address. * * Depending on configuration, this will either be a local bounce buffer * or a pointer to the memory mapped in from the front-end domain for * this request. */ static inline uintptr_t xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { struct xbb_softc *xbb; xbb = reqlist->xbb; return ((uintptr_t)(xbb->gnt_base_addr + (uintptr_t)(reqlist->kva - xbb->kva) + (PAGE_SIZE * pagenr) + (sector << 9))); } /** * Get Kernel Virtual Address space for mapping requests. * * \param xbb Per-instance xbb configuration structure. * \param nr_pages Number of pages needed. * * \return On success, a pointer to the allocated KVA region. Otherwise NULL. * * Note: This should be unnecessary once we have either chaining or * scatter/gather support for struct bio. At that point we'll be able to * put multiple addresses and lengths in one bio/bio chain and won't need * to map everything into one virtual segment. */ static uint8_t * xbb_get_kva(struct xbb_softc *xbb, int nr_pages) { int first_clear; int num_clear; uint8_t *free_kva; int i; KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); first_clear = 0; free_kva = NULL; mtx_lock(&xbb->lock); /* * Look for the first available page. If there are none, we're done. */ bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); if (first_clear == -1) goto bailout; /* * Starting at the first available page, look for consecutive free * pages that will satisfy the user's request. */ for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { /* * If this is true, the page is used, so we have to reset * the number of clear pages and the first clear page * (since it pointed to a region with an insufficient number * of clear pages). */ if (bit_test(xbb->kva_free, i)) { num_clear = 0; first_clear = -1; continue; } if (first_clear == -1) first_clear = i; /* * If this is true, we've found a large enough free region * to satisfy the request. */ if (++num_clear == nr_pages) { bit_nset(xbb->kva_free, first_clear, first_clear + nr_pages - 1); free_kva = xbb->kva + (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); KASSERT(free_kva >= (uint8_t *)xbb->kva && free_kva + (nr_pages * PAGE_SIZE) <= (uint8_t *)xbb->ring_config.va, ("Free KVA %p len %d out of range, " "kva = %#jx, ring VA = %#jx\n", free_kva, nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, (uintmax_t)xbb->ring_config.va)); break; } } bailout: if (free_kva == NULL) { xbb->flags |= XBBF_RESOURCE_SHORTAGE; xbb->kva_shortages++; } mtx_unlock(&xbb->lock); return (free_kva); } /** * Free allocated KVA. * * \param xbb Per-instance xbb configuration structure. * \param kva_ptr Pointer to allocated KVA region.
* \param nr_pages Number of pages in the KVA region. */ static void xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) { intptr_t start_page; mtx_assert(&xbb->lock, MA_OWNED); start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); } /** * Unmap the front-end pages associated with this I/O request. * * \param req The request structure to unmap. */ static void xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) { struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; u_int i; u_int invcount; int error; invcount = 0; for (i = 0; i < reqlist->nr_segments; i++) { if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) continue; unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); unmap[invcount].dev_bus_addr = 0; unmap[invcount].handle = reqlist->gnt_handles[i]; reqlist->gnt_handles[i] = GRANT_REF_INVALID; invcount++; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, invcount); KASSERT(error == 0, ("Grant table operation failed")); } /** * Allocate an internal transaction tracking structure from the free pool. * * \param xbb Per-instance xbb configuration structure. * * \return On success, a pointer to the allocated xbb_xen_reqlist structure. * Otherwise NULL. */ static inline struct xbb_xen_reqlist * xbb_get_reqlist(struct xbb_softc *xbb) { struct xbb_xen_reqlist *reqlist; reqlist = NULL; mtx_assert(&xbb->lock, MA_OWNED); if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); reqlist->flags = XBB_REQLIST_NONE; reqlist->kva = NULL; reqlist->status = BLKIF_RSP_OKAY; reqlist->residual_512b_sectors = 0; reqlist->num_children = 0; reqlist->nr_segments = 0; STAILQ_INIT(&reqlist->contig_req_list); } return (reqlist); } /** * Return an allocated transaction tracking structure to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req The request list structure to free. * \param wakeup If set, wakeup the work thread if freeing this reqlist * during a resource shortage condition. */ static inline void xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int wakeup) { mtx_assert(&xbb->lock, MA_OWNED); if (wakeup) { wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; } if (reqlist->kva != NULL) xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); if ((xbb->flags & XBBF_SHUTDOWN) != 0) { /* * Shutdown is in progress. See if we can * progress further now that one more request * has completed and been returned to the * free pool. */ xbb_shutdown(xbb); } if (wakeup != 0) taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); } /** * Request resources and do basic request setup. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Pointer to reqlist pointer. * \param ring_req Pointer to a block ring request. * \param ring_index The ring index of this request. * * \return 0 for success, non-zero for failure. */ static int xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, blkif_request_t *ring_req, RING_IDX ring_idx) { struct xbb_xen_reqlist *nreqlist; struct xbb_xen_req *nreq; nreqlist = NULL; nreq = NULL; mtx_lock(&xbb->lock); /* * We don't allow new resources to be allocated if we're in the * process of shutting down. 
*/ if ((xbb->flags & XBBF_SHUTDOWN) != 0) { mtx_unlock(&xbb->lock); return (1); } /* * Allocate a reqlist if the caller doesn't have one already. */ if (*reqlist == NULL) { nreqlist = xbb_get_reqlist(xbb); if (nreqlist == NULL) goto bailout_error; } /* We always allocate a request. */ nreq = xbb_get_req(xbb); if (nreq == NULL) goto bailout_error; mtx_unlock(&xbb->lock); if (*reqlist == NULL) { *reqlist = nreqlist; nreqlist->operation = ring_req->operation; nreqlist->starting_sector_number = ring_req->sector_number; STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, links); } nreq->reqlist = *reqlist; nreq->req_ring_idx = ring_idx; nreq->id = ring_req->id; nreq->operation = ring_req->operation; if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); nreq->ring_req = &nreq->ring_req_storage; } else { nreq->ring_req = ring_req; } binuptime(&nreq->ds_t0); devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); (*reqlist)->num_children++; (*reqlist)->nr_segments += ring_req->nr_segments; return (0); bailout_error: /* * We're out of resources, so set the shortage flag. The next time * a request is released, we'll try waking up the work thread to * see if we can allocate more resources. */ xbb->flags |= XBBF_RESOURCE_SHORTAGE; xbb->request_shortages++; if (nreq != NULL) xbb_release_req(xbb, nreq); if (nreqlist != NULL) xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); mtx_unlock(&xbb->lock); return (1); } /** * Create and queue a response to a blkif request. * * \param xbb Per-instance xbb configuration structure. * \param req The request structure to which to respond. * \param status The status code to report. See BLKIF_RSP_* * in sys/xen/interface/io/blkif.h. */ static void xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) { blkif_response_t *resp; /* * The mutex is required here, and should be held across this call * until after the subsequent call to xbb_push_responses(). This * is to guarantee that another context won't queue responses and * push them while we're active. * * That could lead to the other end being notified of responses * before the resources have been freed on this end. The other end * would then be able to queue additional I/O, and we may run out * of resources because we haven't freed them all yet. */ mtx_assert(&xbb->lock, MA_OWNED); /* * Place on the response ring for the relevant domain. * For now, only the spacing between entries is different * in the different ABIs, not the response entry layout. */ switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: resp = RING_GET_RESPONSE(&xbb->rings.native, xbb->rings.native.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_32: resp = (blkif_response_t *) RING_GET_RESPONSE(&xbb->rings.x86_32, xbb->rings.x86_32.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_64: resp = (blkif_response_t *) RING_GET_RESPONSE(&xbb->rings.x86_64, xbb->rings.x86_64.rsp_prod_pvt); break; default: panic("Unexpected blkif protocol ABI."); } resp->id = req->id; resp->operation = req->operation; resp->status = status; if (status != BLKIF_RSP_OKAY) xbb->reqs_completed_with_error++; xbb->rings.common.rsp_prod_pvt++; xbb->reqs_queued_for_completion++; } /** * Send queued responses to blkif requests. * * \param xbb Per-instance xbb configuration structure. * \param run_taskqueue Flag that is set to 1 if the taskqueue * should be run, 0 if it does not need to be run. 
* \param notify Flag that is set to 1 if the other end should be * notified via irq, 0 if the other end should not be * notified. */ static void xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) { int more_to_do; /* * The mutex is required here. */ mtx_assert(&xbb->lock, MA_OWNED); more_to_do = 0; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { /* * Tail check for pending requests. Allows frontend to avoid * notifications if requests are already in flight (lower * overheads and promotes batching). */ RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { more_to_do = 1; } xbb->reqs_completed += xbb->reqs_queued_for_completion; xbb->reqs_queued_for_completion = 0; *run_taskqueue = more_to_do; } /** * Complete a request list. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. */ static void xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { struct xbb_xen_req *nreq; off_t sectors_sent; int notify, run_taskqueue; sectors_sent = 0; if (reqlist->flags & XBB_REQLIST_MAPPED) xbb_unmap_reqlist(reqlist); mtx_lock(&xbb->lock); /* * All I/O is done, send the response. A lock is not necessary * to protect the request list, because all requests have * completed. Therefore this is the only context accessing this * reqlist right now. However, in order to make sure that no one * else queues responses onto the queue or pushes them to the other * side while we're active, we need to hold the lock across the * calls to xbb_queue_response() and xbb_push_responses(). */ STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { off_t cur_sectors_sent; /* Put this response on the ring, but don't push yet */ xbb_queue_response(xbb, nreq, reqlist->status); /* We don't report bytes sent if there is an error. */ if (reqlist->status == BLKIF_RSP_OKAY) cur_sectors_sent = nreq->nr_512b_sectors; else cur_sectors_sent = 0; sectors_sent += cur_sectors_sent; devstat_end_transaction(xbb->xbb_stats_in, /*bytes*/cur_sectors_sent << 9, reqlist->ds_tag_type, reqlist->ds_trans_type, /*now*/NULL, /*then*/&nreq->ds_t0); } /* * Take out any sectors not sent. If we wind up negative (which * might happen if an error is reported as well as a residual), just * report 0 sectors sent. */ sectors_sent -= reqlist->residual_512b_sectors; if (sectors_sent < 0) sectors_sent = 0; devstat_end_transaction(xbb->xbb_stats, /*bytes*/ sectors_sent << 9, reqlist->ds_tag_type, reqlist->ds_trans_type, /*now*/NULL, /*then*/&reqlist->ds_t0); xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); xbb_push_responses(xbb, &run_taskqueue, &notify); mtx_unlock(&xbb->lock); if (run_taskqueue) taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); if (notify) xen_intr_signal(xbb->xen_intr_handle); } /** * Completion handler for buffer I/O requests issued by the device * backend driver. * * \param bio The buffer I/O request on which to perform completion * processing. */ static void xbb_bio_done(struct bio *bio) { struct xbb_softc *xbb; struct xbb_xen_reqlist *reqlist; reqlist = bio->bio_caller1; xbb = reqlist->xbb; reqlist->residual_512b_sectors += bio->bio_resid >> 9; /* * This is a bit imprecise. With aggregated I/O a single * request list can contain multiple front-end requests and * multiple bios may point to a single request.
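 * (For example, a reqlist holding three coalesced front-end writes may
 * be issued as two bios; if either bio fails, the whole reqlist is
 * completed as BLKIF_RSP_ERROR and 0 bytes are reported for every
 * request in it.)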
By carefully * walking the request list, we could map residuals and errors * back to the original front-end request, but the interface * isn't sufficiently rich for us to properly report the error. * So, we just treat the entire request list as having failed if an * error occurs on any part. And, if an error occurs, we treat * the amount of data transferred as 0. * * For residuals, we report it on the overall aggregated device, * but not on the individual requests, since we don't currently * do the work to determine which front-end request to which the * residual applies. */ if (bio->bio_error) { DPRINTF("BIO returned error %d for operation on device %s\n", bio->bio_error, xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; if (bio->bio_error == ENXIO && xenbus_get_state(xbb->dev) == XenbusStateConnected) { /* * Backend device has disappeared. Signal the * front-end that we (the device proxy) want to * go away. */ xenbus_set_state(xbb->dev, XenbusStateClosing); } } #ifdef XBB_USE_BOUNCE_BUFFERS if (bio->bio_cmd == BIO_READ) { vm_offset_t kva_offset; kva_offset = (vm_offset_t)bio->bio_data - (vm_offset_t)reqlist->bounce; memcpy((uint8_t *)reqlist->kva + kva_offset, bio->bio_data, bio->bio_bcount); } #endif /* XBB_USE_BOUNCE_BUFFERS */ /* * Decrement the pending count for the request list. When we're * done with the requests, send status back for all of them. */ if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) xbb_complete_reqlist(xbb, reqlist); g_destroy_bio(bio); } /** * Parse a blkif request into an internal request structure and send * it to the backend for processing. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. * * \return On success, 0. For resource shortages, non-zero. * * This routine performs the backend common aspects of request parsing * including compiling an internal request structure, parsing the S/G * list and any secondary ring requests in which they may reside, and * the mapping of front-end I/O pages into our domain. */ static int xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { struct xbb_sg *xbb_sg; struct gnttab_map_grant_ref *map; struct blkif_request_segment *sg; struct blkif_request_segment *last_block_sg; struct xbb_xen_req *nreq; u_int nseg; u_int seg_idx; u_int block_segs; int nr_sects; int total_sects; int operation; uint8_t bio_flags; int error; reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; bio_flags = 0; total_sects = 0; nr_sects = 0; /* * First determine whether we have enough free KVA to satisfy this * request list. If not, tell xbb_run_queue() so it can go to * sleep until we have more KVA. */ reqlist->kva = NULL; if (reqlist->nr_segments != 0) { reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); if (reqlist->kva == NULL) { /* * If we're out of KVA, return ENOMEM. 
*/ return (ENOMEM); } } binuptime(&reqlist->ds_t0); devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); switch (reqlist->operation) { case BLKIF_OP_WRITE_BARRIER: bio_flags |= BIO_ORDERED; reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; /* FALLTHROUGH */ case BLKIF_OP_WRITE: operation = BIO_WRITE; reqlist->ds_trans_type = DEVSTAT_WRITE; if ((xbb->flags & XBBF_READ_ONLY) != 0) { DPRINTF("Attempt to write to read only device %s\n", xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } break; case BLKIF_OP_READ: operation = BIO_READ; reqlist->ds_trans_type = DEVSTAT_READ; break; case BLKIF_OP_FLUSH_DISKCACHE: /* * If this is true, the user has requested that we disable * flush support. So we just complete the requests * successfully. */ if (xbb->disable_flush != 0) { goto send_response; } /* * The user has requested that we only send a real flush * for every N flush requests. So keep count, and either * complete the request immediately or queue it for the * backend. */ if (xbb->flush_interval != 0) { if (++(xbb->flush_count) < xbb->flush_interval) { goto send_response; } else xbb->flush_count = 0; } operation = BIO_FLUSH; reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; reqlist->ds_trans_type = DEVSTAT_NO_DATA; goto do_dispatch; /*NOTREACHED*/ default: DPRINTF("error: unknown block io operation [%d]\n", reqlist->operation); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } reqlist->xbb = xbb; xbb_sg = xbb->xbb_sgs; map = xbb->maps; seg_idx = 0; STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { blkif_request_t *ring_req; RING_IDX req_ring_idx; u_int req_seg_idx; ring_req = nreq->ring_req; req_ring_idx = nreq->req_ring_idx; nr_sects = 0; nseg = ring_req->nr_segments; nreq->nr_pages = nseg; nreq->nr_512b_sectors = 0; req_seg_idx = 0; sg = NULL; /* Check that number of segments is sane. 
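 * A request must carry at least one segment and no more than the
 * negotiated max_request_segments (BLKIF_MAX_SEGMENTS_PER_REQUEST
 * unless the front-end successfully negotiated something else).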
*/ if (__predict_false(nseg == 0) || __predict_false(nseg > xbb->max_request_segments)) { DPRINTF("Bad number of segments in request (%d)\n", nseg); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } block_segs = nseg; sg = ring_req->seg; last_block_sg = sg + block_segs; while (sg < last_block_sg) { KASSERT(seg_idx < XBB_MAX_SEGMENTS_PER_REQLIST, ("seg_idx %d is too large, max " "segs %d\n", seg_idx, XBB_MAX_SEGMENTS_PER_REQLIST)); xbb_sg->first_sect = sg->first_sect; xbb_sg->last_sect = sg->last_sect; xbb_sg->nsect = (int8_t)(sg->last_sect - sg->first_sect + 1); if ((sg->last_sect >= (PAGE_SIZE >> 9)) || (xbb_sg->nsect <= 0)) { reqlist->status = BLKIF_RSP_ERROR; goto send_response; } nr_sects += xbb_sg->nsect; map->host_addr = xbb_get_gntaddr(reqlist, seg_idx, /*sector*/0); KASSERT(map->host_addr + PAGE_SIZE <= xbb->ring_config.gnt_addr, ("Host address %#jx len %d overlaps " "ring address %#jx\n", (uintmax_t)map->host_addr, PAGE_SIZE, (uintmax_t)xbb->ring_config.gnt_addr)); map->flags = GNTMAP_host_map; map->ref = sg->gref; map->dom = xbb->otherend_id; if (operation == BIO_WRITE) map->flags |= GNTMAP_readonly; sg++; map++; xbb_sg++; seg_idx++; req_seg_idx++; } /* Convert to the disk's sector size */ nreq->nr_512b_sectors = nr_sects; nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; total_sects += nr_sects; if ((nreq->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) { device_printf(xbb->dev, "%s: I/O size (%d) is not " "a multiple of the backing store sector " "size (%d)\n", __func__, nreq->nr_512b_sectors << 9, xbb->sector_size); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, xbb->maps, reqlist->nr_segments); if (error != 0) panic("Grant table operation failed (%d)", error); reqlist->flags |= XBB_REQLIST_MAPPED; for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; seg_idx++, map++) { if (__predict_false(map->status != 0)) { DPRINTF("invalid buffer -- could not remap " "it (%d)\n", map->status); DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " "0x%x ref 0x%x, dom %d\n", seg_idx, map->host_addr, map->flags, map->ref, map->dom); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } reqlist->gnt_handles[seg_idx] = map->handle; } if (reqlist->starting_sector_number + total_sects > xbb->media_num_sectors) { DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " "extends past end of device %s\n", operation == BIO_READ ? "read" : "write", reqlist->starting_sector_number, reqlist->starting_sector_number + total_sects, xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } do_dispatch: error = xbb->dispatch_io(xbb, reqlist, operation, bio_flags); if (error != 0) { reqlist->status = BLKIF_RSP_ERROR; goto send_response; } return (0); send_response: xbb_complete_reqlist(xbb, reqlist); return (0); } static __inline int xbb_count_sects(blkif_request_t *ring_req) { int i; int cur_size = 0; for (i = 0; i < ring_req->nr_segments; i++) { int nsect; nsect = (int8_t)(ring_req->seg[i].last_sect - ring_req->seg[i].first_sect + 1); if (nsect <= 0) break; cur_size += nsect; } return (cur_size); } /** * Process incoming requests from the shared communication ring in response * to a signal on the ring's event channel. * * \param context Callback argument registered during task initialization - * the xbb_softc for this instance. * \param pending The number of taskqueue_enqueue events that have * occurred since this handler was last run.
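 *
 * In outline, the loop below behaves roughly as follows (an illustrative
 * control-flow sketch only, not the literal code):
 *
 *	for (;;) {
 *		while (ring requests remain and resources are available)
 *			coalesce the next request into the tail reqlist;
 *		if (the pending reqlist queue is empty)
 *			break;	(sleep until the next event or completion)
 *		dequeue the head reqlist and hand it to xbb_dispatch_io();
 *	}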
*/ static void xbb_run_queue(void *context, int pending) { struct xbb_softc *xbb; blkif_back_rings_t *rings; RING_IDX rp; uint64_t cur_sector; int cur_operation; struct xbb_xen_reqlist *reqlist; xbb = (struct xbb_softc *)context; rings = &xbb->rings; /* * Work gather and dispatch loop. Note that we have a bias here * towards gathering I/O sent by blockfront. We first gather up * everything in the ring, as long as we have resources. Then we * dispatch one request, and then attempt to gather up any * additional requests that have come in while we were dispatching * the request. * * This allows us to get a clearer picture (via devstat) of how * many requests blockfront is queueing to us at any given time. */ for (;;) { int retval; /* * Initialize reqlist to the last element in the pending * queue, if there is one. This allows us to add more * requests to that request list, if we have room. */ reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, xbb_xen_reqlist, links); if (reqlist != NULL) { cur_sector = reqlist->next_contig_sector; cur_operation = reqlist->operation; } else { cur_operation = 0; cur_sector = 0; } /* * Cache req_prod to avoid accessing a cache line shared * with the frontend. */ rp = rings->common.sring->req_prod; /* Ensure we see queued requests up to 'rp'. */ rmb(); /** * Run so long as there is work to consume and the generation * of a response will not overflow the ring. * * @note There's a 1 to 1 relationship between requests and * responses, so an overflow should never occur. This * test is to protect our domain from digesting bogus * data. Shouldn't we log this? */ while (rings->common.req_cons != rp && RING_REQUEST_CONS_OVERFLOW(&rings->common, rings->common.req_cons) == 0){ blkif_request_t ring_req_storage; blkif_request_t *ring_req; int cur_size; switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: ring_req = RING_GET_REQUEST(&xbb->rings.native, rings->common.req_cons); break; case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_request *ring_req32; ring_req32 = RING_GET_REQUEST( &xbb->rings.x86_32, rings->common.req_cons); blkif_get_x86_32_req(&ring_req_storage, ring_req32); ring_req = &ring_req_storage; break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_request *ring_req64; ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, rings->common.req_cons); blkif_get_x86_64_req(&ring_req_storage, ring_req64); ring_req = &ring_req_storage; break; } default: panic("Unexpected blkif protocol ABI."); /* NOTREACHED */ } /* * Check for situations that would require closing * off this I/O for further coalescing: * - Coalescing is turned off. * - Current I/O is out of sequence with the previous * I/O. * - Coalesced I/O would be too large. */ if ((reqlist != NULL) && ((xbb->no_coalesce_reqs != 0) || ((xbb->no_coalesce_reqs == 0) && ((ring_req->sector_number != cur_sector) || (ring_req->operation != cur_operation) || ((ring_req->nr_segments + reqlist->nr_segments) > xbb->max_reqlist_segments))))) { reqlist = NULL; } /* * Grab and check for all resources in one shot. * If we can't get all of the resources we need, * the shortage is noted and the thread will get * woken up when more resources are available. */ retval = xbb_get_resources(xbb, &reqlist, ring_req, xbb->rings.common.req_cons); if (retval != 0) { /* * Resource shortage has been recorded. * We'll be scheduled to run once a request * object frees up due to a completion. */ break; } /* * Signify that we can overwrite this request with * a response by incrementing our consumer index. 
* The response won't be generated until after * we've already consumed all necessary data out * of the version of the request in the ring buffer * (for native mode). We must update the consumer * index before issuing back-end I/O so there is * no possibility that it will complete and a * response be generated before we make room in * the queue for that response. */ xbb->rings.common.req_cons++; xbb->reqs_received++; cur_size = xbb_count_sects(ring_req); cur_sector = ring_req->sector_number + cur_size; reqlist->next_contig_sector = cur_sector; cur_operation = ring_req->operation; } /* Check for I/O to dispatch */ reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); if (reqlist == NULL) { /* * We're out of work to do, put the task queue to * sleep. */ break; } /* * Grab the first request off the queue and attempt * to dispatch it. */ STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); retval = xbb_dispatch_io(xbb, reqlist); if (retval != 0) { /* * xbb_dispatch_io() returns non-zero only when * there is a resource shortage. If that's the * case, re-queue this request on the head of the * queue, and go to sleep until we have more * resources. */ STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, reqlist, links); break; } else { /* * If we still have anything on the queue after * removing the head entry, that is because we * met one of the criteria to create a new * request list (outlined above), and we'll call * that a forced dispatch for statistical purposes. * * Otherwise, if there is only one element on the * queue, we coalesced everything available on * the ring and we'll call that a normal dispatch. */ reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); if (reqlist != NULL) xbb->forced_dispatch++; else xbb->normal_dispatch++; xbb->total_dispatch++; } } } /** * Interrupt handler bound to the shared ring's event channel. * * \param arg Callback argument registered during event channel * binding - the xbb_softc for this instance. */ static int xbb_filter(void *arg) { struct xbb_softc *xbb; /* Defer to taskqueue thread. */ xbb = (struct xbb_softc *)arg; taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); return (FILTER_HANDLED); } SDT_PROVIDER_DEFINE(xbb); SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", "uint64_t", "uint64_t"); /*----------------------------- Backend Handlers -----------------------------*/ /** * Backend handler for character device access. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. * \param operation BIO_* I/O operation code. * \param bio_flags Additional bio_flag data to pass to any generated * bios (e.g. BIO_ORDERED). * * \return 0 for success, errno codes for failure.
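 *
 * For example, a front-end BLKIF_OP_FLUSH_DISKCACHE arrives here as
 * operation == BIO_FLUSH with no data segments and is issued as a single
 * BIO_ORDERED bio, while a coalesced read/write reqlist may fan out into
 * several bios, one per physically discontiguous KVA range.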
*/ static int xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int bio_flags) { struct xbb_dev_data *dev_data; struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; off_t bio_offset; struct bio *bio; struct xbb_sg *xbb_sg; u_int nbio; u_int bio_idx; u_int nseg; u_int seg_idx; int error; dev_data = &xbb->backend.dev; bio_offset = (off_t)reqlist->starting_sector_number << xbb->sector_size_shift; error = 0; nbio = 0; bio_idx = 0; if (operation == BIO_FLUSH) { bio = g_new_bio(); if (__predict_false(bio == NULL)) { DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); error = ENOMEM; return (error); } bio->bio_cmd = BIO_FLUSH; bio->bio_flags |= BIO_ORDERED; bio->bio_dev = dev_data->cdev; bio->bio_offset = 0; bio->bio_data = 0; bio->bio_done = xbb_bio_done; bio->bio_caller1 = reqlist; bio->bio_pblkno = 0; reqlist->pendcnt = 1; SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, device_get_unit(xbb->dev)); (*dev_data->csw->d_strategy)(bio); return (0); } xbb_sg = xbb->xbb_sgs; bio = NULL; nseg = reqlist->nr_segments; for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * KVA will not be contiguous, so any additional * I/O will need to be represented in a new bio. */ if ((bio != NULL) && (xbb_sg->first_sect != 0)) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { printf("%s: Discontiguous I/O request " "from domain %d ends on " "non-sector boundary\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } bio = NULL; } if (bio == NULL) { /* * Make sure that the start of this bio is * aligned to a device sector. */ if ((bio_offset & (xbb->sector_size - 1)) != 0){ printf("%s: Misaligned I/O request " "from domain %d\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } bio = bios[nbio++] = g_new_bio(); if (__predict_false(bio == NULL)) { error = ENOMEM; goto fail_free_bios; } bio->bio_cmd = operation; bio->bio_flags |= bio_flags; bio->bio_dev = dev_data->cdev; bio->bio_offset = bio_offset; bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, xbb_sg->first_sect); bio->bio_done = xbb_bio_done; bio->bio_caller1 = reqlist; bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; } bio->bio_length += xbb_sg->nsect << 9; bio->bio_bcount = bio->bio_length; bio_offset += xbb_sg->nsect << 9; if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { printf("%s: Discontiguous I/O request " "from domain %d ends on " "non-sector boundary\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } /* * KVA will not be contiguous, so any additional * I/O will need to be represented in a new bio. 
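 * (For example, with 4 KiB pages a full page spans 512b sectors 0-7; a
 * segment whose last_sect is 3 ends mid-page, so the next segment's data
 * cannot be virtually contiguous with it and must start a fresh bio.)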
*/ bio = NULL; } } reqlist->pendcnt = nbio; for (bio_idx = 0; bio_idx < nbio; bio_idx++) { #ifdef XBB_USE_BOUNCE_BUFFERS vm_offset_t kva_offset; kva_offset = (vm_offset_t)bios[bio_idx]->bio_data - (vm_offset_t)reqlist->bounce; if (operation == BIO_WRITE) { memcpy(bios[bio_idx]->bio_data, (uint8_t *)reqlist->kva + kva_offset, bios[bio_idx]->bio_bcount); } #endif if (operation == BIO_READ) { SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, device_get_unit(xbb->dev), bios[bio_idx]->bio_offset, bios[bio_idx]->bio_length); } else if (operation == BIO_WRITE) { SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, device_get_unit(xbb->dev), bios[bio_idx]->bio_offset, bios[bio_idx]->bio_length); } (*dev_data->csw->d_strategy)(bios[bio_idx]); } return (error); fail_free_bios: for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) g_destroy_bio(bios[bio_idx]); return (error); } SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", "uint64_t", "uint64_t"); /** * Backend handler for file access. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list. * \param operation BIO_* I/O operation code. * \param flags Additional bio_flag data to pass to any generated bios * (e.g. BIO_ORDERED). * * \return 0 for success, errno codes for failure. */ static int xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int flags) { struct xbb_file_data *file_data; u_int seg_idx; u_int nseg; struct uio xuio; struct xbb_sg *xbb_sg; struct iovec *xiovec; #ifdef XBB_USE_BOUNCE_BUFFERS void **p_vaddr; int saved_uio_iovcnt; #endif /* XBB_USE_BOUNCE_BUFFERS */ int error; file_data = &xbb->backend.file; error = 0; bzero(&xuio, sizeof(xuio)); switch (operation) { case BIO_READ: xuio.uio_rw = UIO_READ; break; case BIO_WRITE: xuio.uio_rw = UIO_WRITE; break; case BIO_FLUSH: { struct mount *mountpoint; SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, device_get_unit(xbb->dev)); (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); VOP_UNLOCK(xbb->vn); vn_finished_write(mountpoint); goto bailout_send_response; /* NOTREACHED */ } default: panic("invalid operation %d", operation); /* NOTREACHED */ } xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number << xbb->sector_size_shift; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = file_data->xiovecs; xuio.uio_iovcnt = 0; xbb_sg = xbb->xbb_sgs; nseg = reqlist->nr_segments; for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * If the first sector is not 0, the KVA will * not be contiguous and we'll need to go on * to another segment. */ if (xbb_sg->first_sect != 0) xiovec = NULL; if (xiovec == NULL) { xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, seg_idx, xbb_sg->first_sect); #ifdef XBB_USE_BOUNCE_BUFFERS /* * Store the address of the incoming * buffer at this particular offset * as well, so we can do the copy * later without having to do more * work to recalculate this address.
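 * (With bounce buffers enabled the overall flow is: for writes, copy
 * from the mapped front-end pages into the bounce region before the
 * VOP_WRITE(); for reads, save a copy of the iovecs here and copy back
 * out to the front-end pages after VOP_READ() completes, since uiomove()
 * consumes the live uio as it goes.)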
*/ p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, xbb_sg->first_sect); #endif /* XBB_USE_BOUNCE_BUFFERS */ xiovec->iov_len = 0; xuio.uio_iovcnt++; } xiovec->iov_len += xbb_sg->nsect << 9; xuio.uio_resid += xbb_sg->nsect << 9; /* * If the last sector is not the full page * size count, the next segment will not be * contiguous in KVA and we need a new iovec. */ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) xiovec = NULL; } xuio.uio_td = curthread; #ifdef XBB_USE_BOUNCE_BUFFERS saved_uio_iovcnt = xuio.uio_iovcnt; if (operation == BIO_WRITE) { /* Copy the write data to the local buffer. */ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; seg_idx++, xiovec++, p_vaddr++) { memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); } } else { /* * We only need to save off the iovecs in the case of a * read, because the copy for the read happens after the * VOP_READ(). (The uio will get modified in that call * sequence.) */ memcpy(file_data->saved_xiovecs, xuio.uio_iov, xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); } #endif /* XBB_USE_BOUNCE_BUFFERS */ switch (operation) { case BIO_READ: SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, device_get_unit(xbb->dev), xuio.uio_offset, xuio.uio_resid); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); /* * UFS pays attention to IO_DIRECT for reads. If the * DIRECTIO option is configured into the kernel, it calls * ffs_rawread(). But that only works for single-segment * uios with user space addresses. In our case, with a * kernel uio, it still reads into the buffer cache, but it * will just try to release the buffer from the cache later * on in ffs_read(). * * ZFS does not pay attention to IO_DIRECT for reads. * * UFS does not pay attention to IO_SYNC for reads. * * ZFS pays attention to IO_SYNC (which translates into the * Solaris define FRSYNC for zfs_read()) for reads. It * attempts to sync the file before reading. * * So, to attempt to provide some barrier semantics in the * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. */ error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? (IO_DIRECT|IO_SYNC) : 0, file_data->cred); VOP_UNLOCK(xbb->vn); break; case BIO_WRITE: { struct mount *mountpoint; SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, device_get_unit(xbb->dev), xuio.uio_offset, xuio.uio_resid); (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); /* * UFS pays attention to IO_DIRECT for writes. The write * is done asynchronously. (Normally the write would just * get put into cache.) * * UFS pays attention to IO_SYNC for writes. It will * attempt to write the buffer out synchronously if that * flag is set. * * ZFS does not pay attention to IO_DIRECT for writes. * * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) * for writes. It will flush the transaction from the * cache before returning. * * So if we've got the BIO_ORDERED flag set, we want * IO_SYNC in either the UFS or ZFS case. */ error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
IO_SYNC : 0, file_data->cred); VOP_UNLOCK(xbb->vn); vn_finished_write(mountpoint); break; } default: panic("invalid operation %d", operation); /* NOTREACHED */ } #ifdef XBB_USE_BOUNCE_BUFFERS /* We only need to copy here for read operations */ if (operation == BIO_READ) { for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, xiovec = file_data->saved_xiovecs; seg_idx < saved_uio_iovcnt; seg_idx++, xiovec++, p_vaddr++) { /* * Note that we have to use the copy of the * io vector we made above. uiomove() modifies * the uio and its referenced vector as uiomove * performs the copy, so we can't rely on any * state from the original uio. */ memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); } } #endif /* XBB_USE_BOUNCE_BUFFERS */ bailout_send_response: if (error != 0) reqlist->status = BLKIF_RSP_ERROR; xbb_complete_reqlist(xbb, reqlist); return (0); } /*--------------------------- Backend Configuration --------------------------*/ /** * Close and cleanup any backend device/file specific state for this * block back instance. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_close_backend(struct xbb_softc *xbb) { DROP_GIANT(); DPRINTF("closing dev=%s\n", xbb->dev_name); if (xbb->vn) { int flags = FREAD; if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; switch (xbb->device_type) { case XBB_TYPE_DISK: if (xbb->backend.dev.csw) { dev_relthread(xbb->backend.dev.cdev, xbb->backend.dev.dev_ref); xbb->backend.dev.csw = NULL; xbb->backend.dev.cdev = NULL; } break; case XBB_TYPE_FILE: break; case XBB_TYPE_NONE: default: panic("Unexpected backend type."); break; } (void)vn_close(xbb->vn, flags, NOCRED, curthread); xbb->vn = NULL; switch (xbb->device_type) { case XBB_TYPE_DISK: break; case XBB_TYPE_FILE: if (xbb->backend.file.cred != NULL) { crfree(xbb->backend.file.cred); xbb->backend.file.cred = NULL; } break; case XBB_TYPE_NONE: default: panic("Unexpected backend type."); break; } } PICKUP_GIANT(); } /** * Open a character device to be used for backend I/O. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. */ static int xbb_open_dev(struct xbb_softc *xbb) { struct vattr vattr; struct cdev *dev; struct cdevsw *devsw; int error; xbb->device_type = XBB_TYPE_DISK; xbb->dispatch_io = xbb_dispatch_dev; xbb->backend.dev.cdev = xbb->vn->v_rdev; xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, &xbb->backend.dev.dev_ref); if (xbb->backend.dev.csw == NULL) panic("Unable to retrieve device switch"); error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); if (error) { xenbus_dev_fatal(xbb->dev, error, "error getting " "vnode attributes for device %s", xbb->dev_name); return (error); } dev = xbb->vn->v_rdev; devsw = dev->si_devsw; if (!devsw->d_ioctl) { xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " "device %s!", xbb->dev_name); return (ENODEV); } error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&xbb->sector_size, FREAD, curthread); if (error) { xenbus_dev_fatal(xbb->dev, error, "error calling ioctl DIOCGSECTORSIZE " "for device %s", xbb->dev_name); return (error); } error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&xbb->media_size, FREAD, curthread); if (error) { xenbus_dev_fatal(xbb->dev, error, "error calling ioctl DIOCGMEDIASIZE " "for device %s", xbb->dev_name); return (error); } return (0); } /** * Open a file to be used for backend I/O. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. 
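 *
 * As a point of comparison with the device case above, the geometry the
 * driver probes there with DIOCGSECTORSIZE/DIOCGMEDIASIZE can also be
 * read from a userland C program (a minimal sketch; the device path and
 * the omitted error handling are illustrative assumptions):
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/disk.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		u_int secsz;
 *		off_t mediasz;
 *		int fd = open("/dev/ada0", O_RDONLY);
 *
 *		ioctl(fd, DIOCGSECTORSIZE, &secsz);	(sector_size here)
 *		ioctl(fd, DIOCGMEDIASIZE, &mediasz);	(media_size here)
 *		printf("%u-byte sectors, %jd bytes total\n", secsz,
 *		    (intmax_t)mediasz);
 *		close(fd);
 *		return (0);
 *	}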
*/ static int xbb_open_file(struct xbb_softc *xbb) { struct xbb_file_data *file_data; struct vattr vattr; int error; file_data = &xbb->backend.file; xbb->device_type = XBB_TYPE_FILE; xbb->dispatch_io = xbb_dispatch_file; error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "error calling VOP_GETATTR() " "for file %s", xbb->dev_name); return (error); } /* * Verify that we have the ability to upgrade to exclusive * access on this file so we can trap errors at open instead * of reporting them during first access. */ if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(xbb->vn)) { error = EBADF; xenbus_dev_fatal(xbb->dev, error, "error locking file %s", xbb->dev_name); return (error); } } file_data->cred = crhold(curthread->td_ucred); xbb->media_size = vattr.va_size; /* * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. * With ZFS, it is 131072 bytes. Block sizes that large don't work * with disklabel and UFS on FreeBSD at least. Large block sizes * may not work with other OSes as well. So just export a sector * size of 512 bytes, which should work with any OS or * application. Since our backing is a file, any block size will * work fine for the backing store. */ #if 0 xbb->sector_size = vattr.va_blocksize; #endif xbb->sector_size = 512; /* * Sanity check. The media size has to be at least one * sector long. */ if (xbb->media_size < xbb->sector_size) { error = EINVAL; xenbus_dev_fatal(xbb->dev, error, "file %s size %ju < block size %u", xbb->dev_name, (uintmax_t)xbb->media_size, xbb->sector_size); } return (error); } /** * Open the backend provider for this connection. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. */ static int xbb_open_backend(struct xbb_softc *xbb) { struct nameidata nd; int flags; int error; flags = FREAD; error = 0; DPRINTF("opening dev=%s\n", xbb->dev_name); if (rootvnode == NULL) { xenbus_dev_fatal(xbb->dev, ENOENT, "Root file system not mounted"); return (ENOENT); } if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; pwd_ensure_dirs(); again: NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); error = vn_open(&nd, &flags, 0, NULL); if (error) { /* * If the user doesn't give us a fully qualified path, * this is the only reasonable guess we can make as to * the path. If they want to specify a file, they need * to specify the full path. */ if (xbb->dev_name[0] != '/') { char *dev_path = "/dev/"; char *dev_name; /* Try adding device path at beginning of name */ dev_name = malloc(strlen(xbb->dev_name) + strlen(dev_path) + 1, M_XENBLOCKBACK, M_NOWAIT); if (dev_name) { sprintf(dev_name, "%s%s", dev_path, xbb->dev_name); free(xbb->dev_name, M_XENBLOCKBACK); xbb->dev_name = dev_name; goto again; } } xenbus_dev_fatal(xbb->dev, error, "error opening device %s", xbb->dev_name); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); xbb->vn = nd.ni_vp; /* We only support disks and files.
*/ - if (vn_isdisk(xbb->vn, &error)) { + if (vn_isdisk_error(xbb->vn, &error)) { error = xbb_open_dev(xbb); } else if (xbb->vn->v_type == VREG) { error = xbb_open_file(xbb); } else { error = EINVAL; xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " "or file", xbb->dev_name); } VOP_UNLOCK(xbb->vn); if (error != 0) { xbb_close_backend(xbb); return (error); } xbb->sector_size_shift = fls(xbb->sector_size) - 1; xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", xbb->dev_name, xbb->sector_size, xbb->media_size); return (0); } /*------------------------ Inter-Domain Communication ------------------------*/ /** * Free dynamically allocated KVA or pseudo-physical address allocations. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_free_communication_mem(struct xbb_softc *xbb) { if (xbb->kva != 0) { if (xbb->pseudo_phys_res != NULL) { xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, xbb->pseudo_phys_res); xbb->pseudo_phys_res = NULL; } } xbb->kva = 0; xbb->gnt_base_addr = 0; if (xbb->kva_free != NULL) { free(xbb->kva_free, M_XENBLOCKBACK); xbb->kva_free = NULL; } } /** * Clean up all inter-domain communication mechanisms. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_disconnect(struct xbb_softc *xbb) { struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; struct gnttab_unmap_grant_ref *op; u_int ring_idx; int error; DPRINTF("\n"); if ((xbb->flags & XBBF_RING_CONNECTED) == 0) return (0); mtx_unlock(&xbb->lock); xen_intr_unbind(&xbb->xen_intr_handle); taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); mtx_lock(&xbb->lock); /* * No new interrupts can generate work, but we must wait * for all currently active requests to drain. */ if (xbb->active_request_count != 0) return (EAGAIN); for (ring_idx = 0, op = ops; ring_idx < xbb->ring_config.ring_pages; ring_idx++, op++) { op->host_addr = xbb->ring_config.gnt_addr + (ring_idx * PAGE_SIZE); op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; op->handle = xbb->ring_config.handle[ring_idx]; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, xbb->ring_config.ring_pages); if (error != 0) panic("Grant table op failed (%d)", error); xbb_free_communication_mem(xbb); if (xbb->requests != NULL) { free(xbb->requests, M_XENBLOCKBACK); xbb->requests = NULL; } if (xbb->request_lists != NULL) { struct xbb_xen_reqlist *reqlist; int i; /* There is one request list for every allocated request. */ for (i = 0, reqlist = xbb->request_lists; i < xbb->max_requests; i++, reqlist++) { #ifdef XBB_USE_BOUNCE_BUFFERS if (reqlist->bounce != NULL) { free(reqlist->bounce, M_XENBLOCKBACK); reqlist->bounce = NULL; } #endif if (reqlist->gnt_handles != NULL) { free(reqlist->gnt_handles, M_XENBLOCKBACK); reqlist->gnt_handles = NULL; } } free(xbb->request_lists, M_XENBLOCKBACK); xbb->request_lists = NULL; } xbb->flags &= ~XBBF_RING_CONNECTED; return (0); } /** * Map shared memory ring into domain local address space, initialize * ring control structures, and bind an interrupt to the event channel * used to notify us of ring changes. * * \param xbb Per-instance xbb configuration structure.
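 *
 * For example, with a single-page ring (the legacy default) the ring
 * occupies the final PAGE_SIZE bytes of the KVA region sized by
 * xbb_alloc_communication_mem(), so ring_config.va lands at
 * kva + kva_size - PAGE_SIZE and a single grant map operation backs it.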
*/ static int xbb_connect_ring(struct xbb_softc *xbb) { struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; struct gnttab_map_grant_ref *gnt; u_int ring_idx; int error; if ((xbb->flags & XBBF_RING_CONNECTED) != 0) return (0); /* * Kva for our ring is at the tail of the region of kva allocated * by xbb_alloc_communication_mem(). */ xbb->ring_config.va = xbb->kva + (xbb->kva_size - (xbb->ring_config.ring_pages * PAGE_SIZE)); xbb->ring_config.gnt_addr = xbb->gnt_base_addr + (xbb->kva_size - (xbb->ring_config.ring_pages * PAGE_SIZE)); for (ring_idx = 0, gnt = gnts; ring_idx < xbb->ring_config.ring_pages; ring_idx++, gnt++) { gnt->host_addr = xbb->ring_config.gnt_addr + (ring_idx * PAGE_SIZE); gnt->flags = GNTMAP_host_map; gnt->ref = xbb->ring_config.ring_ref[ring_idx]; gnt->dom = xbb->otherend_id; } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, xbb->ring_config.ring_pages); if (error) panic("blkback: Ring page grant table op failed (%d)", error); for (ring_idx = 0, gnt = gnts; ring_idx < xbb->ring_config.ring_pages; ring_idx++, gnt++) { if (gnt->status != 0) { xbb->ring_config.va = 0; xenbus_dev_fatal(xbb->dev, EACCES, "Ring shared page mapping failed. " "Status %d.", gnt->status); return (EACCES); } xbb->ring_config.handle[ring_idx] = gnt->handle; xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; } /* Initialize the ring based on ABI. */ switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: { blkif_sring_t *sring; sring = (blkif_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.native, sring, xbb->ring_config.ring_pages * PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_32: { blkif_x86_32_sring_t *sring_x86_32; sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, xbb->ring_config.ring_pages * PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_64: { blkif_x86_64_sring_t *sring_x86_64; sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, xbb->ring_config.ring_pages * PAGE_SIZE); break; } default: panic("Unexpected blkif protocol ABI."); } xbb->flags |= XBBF_RING_CONNECTED; error = xen_intr_bind_remote_port(xbb->dev, xbb->otherend_id, xbb->ring_config.evtchn, xbb_filter, /*ithread_handler*/NULL, /*arg*/xbb, INTR_TYPE_BIO | INTR_MPSAFE, &xbb->xen_intr_handle); if (error) { (void)xbb_disconnect(xbb); xenbus_dev_fatal(xbb->dev, error, "binding event channel"); return (error); } DPRINTF("rings connected!\n"); return 0; } /** * Size KVA and pseudo-physical address allocations based on negotiated * values for the size and number of I/O requests, and the size of our * communication ring. * * \param xbb Per-instance xbb configuration structure. * * These address spaces are used to dynamically map pages in the * front-end's domain into our own. */ static int xbb_alloc_communication_mem(struct xbb_softc *xbb) { xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; xbb->kva_size = xbb->reqlist_kva_size + (xbb->ring_config.ring_pages * PAGE_SIZE); xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); if (xbb->kva_free == NULL) return (ENOMEM); DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", device_get_nameunit(xbb->dev), xbb->kva_size, xbb->reqlist_kva_size); /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine * pages ("real memory") during the lifetime of front-end requests * via grant table operations. 
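 * (As a worked sizing example: with the legacy defaults of 32 requests
 * of up to BLKIF_MAX_SEGMENTS_PER_REQUEST (11) segments each, plus one
 * ring page, this reserves (32 * 11 + 1) * 4096 = 1445888 bytes of
 * pseudo-physical address space on a 4 KiB page system.)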
*/ xbb->pseudo_phys_res_id = 0; xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, xbb->kva_size); if (xbb->pseudo_phys_res == NULL) { xbb->kva = 0; return (ENOMEM); } xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, (uintmax_t)xbb->gnt_base_addr); return (0); } /** * Collect front-end information from the XenStore. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_collect_frontend_info(struct xbb_softc *xbb) { char protocol_abi[64]; const char *otherend_path; int error; u_int ring_idx; u_int ring_page_order; size_t ring_size; otherend_path = xenbus_get_otherend_path(xbb->dev); /* * Protocol defaults valid even if all negotiation fails. */ xbb->ring_config.ring_pages = 1; xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; /* * Mandatory data (used in all versions of the protocol) first. */ error = xs_scanf(XST_NIL, otherend_path, "event-channel", NULL, "%" PRIu32, &xbb->ring_config.evtchn); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to retrieve event-channel information " "from frontend %s. Unable to connect.", xenbus_get_otherend_path(xbb->dev)); return (error); } /* * These fields are initialized to legacy protocol defaults * so we only need to fail if reading the updated value succeeds * and the new value is outside of its allowed range. * * \note xs_gather() returns on the first encountered error, so * we must use independent calls in order to guarantee * we don't miss information in a sparsely populated front-end * tree. * * \note xs_scanf() does not update variables for unmatched * fields. */ ring_page_order = 0; xbb->max_requests = 32; (void)xs_scanf(XST_NIL, otherend_path, "ring-page-order", NULL, "%u", &ring_page_order); xbb->ring_config.ring_pages = 1 << ring_page_order; ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { xenbus_dev_fatal(xbb->dev, EINVAL, "Front-end specified ring-pages of %u " "exceeds backend limit of %u. " "Unable to connect.", xbb->ring_config.ring_pages, XBB_MAX_RING_PAGES); return (EINVAL); } if (xbb->ring_config.ring_pages == 1) { error = xs_gather(XST_NIL, otherend_path, "ring-ref", "%" PRIu32, &xbb->ring_config.ring_ref[0], NULL); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to retrieve ring information " "from frontend %s. Unable to " "connect.", xenbus_get_otherend_path(xbb->dev)); return (error); } } else { /* Multi-page ring format. */ for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; ring_idx++) { char ring_ref_name[] = "ring_refXX"; snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", ring_idx); error = xs_scanf(XST_NIL, otherend_path, ring_ref_name, NULL, "%" PRIu32, &xbb->ring_config.ring_ref[ring_idx]); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Failed to retrieve grant " "reference for page %u of " "shared ring. Unable " "to connect.", ring_idx); return (error); } } } error = xs_gather(XST_NIL, otherend_path, "protocol", "%63s", protocol_abi, NULL); if (error != 0 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { /* * Assume native if the frontend has not * published ABI data or it has published and * matches our own ABI.
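 * (For example, a 32-bit front-end on a 64-bit host publishes
 * protocol = XEN_IO_PROTO_ABI_X86_32; we then select
 * BLKIF_PROTOCOL_X86_32 and translate its differently padded ring
 * entries via blkif_get_x86_32_req() before processing them.)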
*/ xbb->abi = BLKIF_PROTOCOL_NATIVE; } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { xbb->abi = BLKIF_PROTOCOL_X86_32; } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { xbb->abi = BLKIF_PROTOCOL_X86_64; } else { xenbus_dev_fatal(xbb->dev, EINVAL, "Unknown protocol ABI (%s) published by " "frontend. Unable to connect.", protocol_abi); return (EINVAL); } return (0); } /** * Allocate per-request data structures given request size and number * information negotiated with the front-end. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_alloc_requests(struct xbb_softc *xbb) { struct xbb_xen_req *req; struct xbb_xen_req *last_req; /* * Allocate request bookkeeping data structures. */ xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (xbb->requests == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request structures"); return (ENOMEM); } req = xbb->requests; last_req = &xbb->requests[xbb->max_requests - 1]; STAILQ_INIT(&xbb->request_free_stailq); while (req <= last_req) { STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); req++; } return (0); } static int xbb_alloc_request_lists(struct xbb_softc *xbb) { struct xbb_xen_reqlist *reqlist; int i; /* * If no requests can be merged, we need 1 request list per * in-flight request. */ xbb->request_lists = malloc(xbb->max_requests * sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (xbb->request_lists == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request list structures"); return (ENOMEM); } STAILQ_INIT(&xbb->reqlist_free_stailq); STAILQ_INIT(&xbb->reqlist_pending_stailq); for (i = 0; i < xbb->max_requests; i++) { int seg; reqlist = &xbb->request_lists[i]; reqlist->xbb = xbb; #ifdef XBB_USE_BOUNCE_BUFFERS reqlist->bounce = malloc(xbb->max_reqlist_size, M_XENBLOCKBACK, M_NOWAIT); if (reqlist->bounce == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "bounce buffers"); return (ENOMEM); } #endif /* XBB_USE_BOUNCE_BUFFERS */ reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * sizeof(*reqlist->gnt_handles), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (reqlist->gnt_handles == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "grant references"); return (ENOMEM); } for (seg = 0; seg < xbb->max_reqlist_segments; seg++) reqlist->gnt_handles[seg] = GRANT_REF_INVALID; STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); } return (0); } /** * Supply information about the physical device to the frontend * via XenBus. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_publish_backend_info(struct xbb_softc *xbb) { struct xs_transaction xst; const char *our_path; const char *leaf; int error; our_path = xenbus_get_node(xbb->dev); while (1) { error = xs_transaction_start(&xst); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Error publishing backend info " "(start transaction)"); return (error); } leaf = "sectors"; error = xs_printf(xst, our_path, leaf, "%"PRIu64, xbb->media_num_sectors); if (error != 0) break; /* XXX Support all VBD attributes here. */ leaf = "info"; error = xs_printf(xst, our_path, leaf, "%u", xbb->flags & XBBF_READ_ONLY ?
VDISK_READONLY : 0); if (error != 0) break; leaf = "sector-size"; error = xs_printf(xst, our_path, leaf, "%u", xbb->sector_size); if (error != 0) break; error = xs_transaction_end(xst, 0); if (error == 0) { return (0); } else if (error != EAGAIN) { xenbus_dev_fatal(xbb->dev, error, "ending transaction"); return (error); } } xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", our_path, leaf); xs_transaction_end(xst, 1); return (error); } /** * Connect to our blkfront peer now that it has completed publishing * its configuration into the XenStore. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_connect(struct xbb_softc *xbb) { int error; if (!xbb->hotplug_done || (xenbus_get_state(xbb->dev) != XenbusStateInitWait) || (xbb_collect_frontend_info(xbb) != 0)) return; xbb->flags &= ~XBBF_SHUTDOWN; /* * We limit the maximum number of reqlist segments to the maximum * number of segments in the ring, or our absolute maximum, * whichever is smaller. */ xbb->max_reqlist_segments = MIN(xbb->max_request_segments * xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); /* * The maximum size is simply a function of the number of segments * we can handle. */ xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; /* Allocate resources whose size depends on front-end configuration. */ error = xbb_alloc_communication_mem(xbb); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to allocate communication memory"); return; } error = xbb_alloc_requests(xbb); if (error != 0) { /* Specific errors are reported by xbb_alloc_requests(). */ return; } error = xbb_alloc_request_lists(xbb); if (error != 0) { /* Specific errors are reported by xbb_alloc_request_lists(). */ return; } /* * Connect communication channel. */ error = xbb_connect_ring(xbb); if (error != 0) { /* Specific errors are reported by xbb_connect_ring(). */ return; } if (xbb_publish_backend_info(xbb) != 0) { /* * If we can't publish our data, we cannot participate * in this connection, and waiting for a front-end state * change will not help the situation. */ (void)xbb_disconnect(xbb); return; } /* Ready for I/O. */ xenbus_set_state(xbb->dev, XenbusStateConnected); } /*-------------------------- Device Teardown Support -------------------------*/ /** * Perform device shutdown functions. * * \param xbb Per-instance xbb configuration structure. * * Mark this instance as shutting down, wait for any active I/O on the * backend device/file to drain, disconnect from the front-end, and notify * any waiters (e.g. a thread invoking our detach method) that detach can * now proceed. */ static int xbb_shutdown(struct xbb_softc *xbb) { XenbusState frontState; int error; DPRINTF("\n"); /* * Due to the need to drop our mutex during some * xenbus operations, it is possible for two threads * to attempt to close out shutdown processing at * the same time. Tell the caller that hits this * race to try back later. */ if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) return (EAGAIN); xbb->flags |= XBBF_IN_SHUTDOWN; mtx_unlock(&xbb->lock); if (xbb->hotplug_watch.node != NULL) { xs_unregister_watch(&xbb->hotplug_watch); free(xbb->hotplug_watch.node, M_XENBLOCKBACK); xbb->hotplug_watch.node = NULL; } xbb->hotplug_done = false; if (xenbus_get_state(xbb->dev) < XenbusStateClosing) xenbus_set_state(xbb->dev, XenbusStateClosing); frontState = xenbus_get_otherend_state(xbb->dev); mtx_lock(&xbb->lock); xbb->flags &= ~XBBF_IN_SHUTDOWN; /* Wait for the frontend to disconnect (if it's connected). 
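 *
 * Returning EAGAIN here hands the retry to the caller: the frontend
 * state-change callback re-enters xbb_shutdown() once the peer moves
 * past Connected, and a detaching thread simply polls.  A minimal
 * sketch of the polling caller (the same pattern xbb_detach() uses
 * below), invoked with xbb->lock held:
 *
 *	while (xbb_shutdown(xbb) == EAGAIN)
 *		msleep(xbb, &xbb->lock, 0, "xbb_shutdown", 0);
 *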
*/ if (frontState == XenbusStateConnected) return (EAGAIN); DPRINTF("\n"); /* Indicate shutdown is in progress. */ xbb->flags |= XBBF_SHUTDOWN; /* Disconnect from the front-end. */ error = xbb_disconnect(xbb); if (error != 0) { /* * Requests still outstanding. We'll be called again * once they complete. */ KASSERT(error == EAGAIN, ("%s: Unexpected xbb_disconnect() failure %d", __func__, error)); return (error); } DPRINTF("\n"); /* Indicate to xbb_detach() that it is safe to proceed. */ wakeup(xbb); return (0); } /** * Report an attach-time error to the console and Xen, and clean up * this instance by forcing immediate detach processing. * * \param xbb Per-instance xbb configuration structure. * \param err Errno describing the error. * \param fmt Printf style format and arguments */ static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) { va_list ap; va_list ap_hotplug; va_start(ap, fmt); va_copy(ap_hotplug, ap); xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-error", fmt, ap_hotplug); va_end(ap_hotplug); xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-status", "error"); xenbus_dev_vfatal(xbb->dev, err, fmt, ap); va_end(ap); xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "online", "0"); mtx_lock(&xbb->lock); xbb_shutdown(xbb); mtx_unlock(&xbb->lock); } /*---------------------------- NewBus Entrypoints ----------------------------*/ /** * Inspect a XenBus device and claim it if it is of the appropriate type. * * \param dev NewBus device object representing a candidate XenBus device. * * \return 0 for success, errno codes for failure. */ static int xbb_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vbd")) { device_set_desc(dev, "Backend Virtual Block Device"); device_quiet(dev); return (0); } return (ENXIO); } /** * Set up sysctl variables to control various Block Back parameters. * * \param xbb Xen Block Back softc.
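 *
 * These nodes are created under the device's sysctl tree, so with the
 * driver named "xbbd" they should surface as dev.xbbd.<unit>.<name>.
 * A hedged userland sketch (node name assumed, not taken from this
 * file) reading one of the counters:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	static void
 *	print_completed_reqs(void)
 *	{
 *		uint64_t reqs;
 *		size_t len = sizeof(reqs);
 *
 *		if (sysctlbyname("dev.xbbd.0.reqs_completed", &reqs,
 *		    &len, NULL, 0) == 0)
 *			printf("completed: %ju\n", (uintmax_t)reqs);
 *	}
 *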
* */ static void xbb_setup_sysctl(struct xbb_softc *xbb) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; sysctl_ctx = device_get_sysctl_ctx(xbb->dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xbb->dev); if (sysctl_tree == NULL) return; SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, "fake the flush command"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, "send a real flush for N flush requests"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs, 0, "Don't coalesce contiguous requests"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_received", CTLFLAG_RW, &xbb->reqs_received, "how many I/O requests we have received"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, "how many I/O requests have been completed"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_queued_for_completion", CTLFLAG_RW, &xbb->reqs_queued_for_completion, "how many I/O requests queued but not yet pushed"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_completed_with_error", CTLFLAG_RW, &xbb->reqs_completed_with_error, "how many I/O requests completed with error status"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, "how many I/O dispatches were forced"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, "how many I/O dispatches were normal"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, "total number of I/O dispatches"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, "how many times we have run out of KVA"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "request_shortages", CTLFLAG_RW, &xbb->request_shortages, "how many times we have run out of requests"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, "maximum outstanding requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_request_segments", CTLFLAG_RD, &xbb->max_request_segments, 0, "maximum number of pages per request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_request_size", CTLFLAG_RD, &xbb->max_request_size, 0, "maximum size in bytes of a request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "ring_pages", CTLFLAG_RD, &xbb->ring_config.ring_pages, 0, "communication channel pages (negotiated)"); } static void xbb_attach_disk(struct xs_watch *watch, const char **vec, unsigned int len) { device_t dev; struct xbb_softc *xbb; int error; dev = (device_t) watch->callback_data; xbb = device_get_softc(dev); error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", NULL, &xbb->dev_name, NULL); if (error != 0) return; xs_unregister_watch(watch); free(watch->node, M_XENBLOCKBACK); watch->node = NULL; /* Collect physical device information.
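 *
 * The xs_gather() calls below take (node, format, result) triples
 * terminated by NULL; a NULL format stores a malloc'd string that the
 * caller eventually releases to M_XENSTORE (as xbb_detach() does for
 * dev_name, dev_type, and dev_mode).  A condensed sketch of the
 * pattern, with the error handling elided:
 *
 *	char *mode = NULL;
 *	bool read_only = false;
 *
 *	if (xs_gather(XST_NIL, xenbus_get_node(dev),
 *	    "mode", NULL, &mode, NULL) == 0) {
 *		read_only = (strchr(mode, 'w') == NULL);
 *		free(mode, M_XENSTORE);
 *	}
 *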
*/ error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), "device-type", NULL, &xbb->dev_type, NULL); if (error != 0) xbb->dev_type = NULL; error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, &xbb->dev_mode, NULL); if (error != 0) { xbb_attach_failed(xbb, error, "reading backend fields at %s", xenbus_get_node(dev)); return; } /* Parse fopen-style mode flags. */ if (strchr(xbb->dev_mode, 'w') == NULL) xbb->flags |= XBBF_READ_ONLY; /* * Verify the physical device is present and can support * the desired I/O mode. */ error = xbb_open_backend(xbb); if (error != 0) { xbb_attach_failed(xbb, error, "Unable to open %s", xbb->dev_name); return; } /* Use devstat(9) for recording statistics. */ xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), xbb->sector_size, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), xbb->sector_size, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); /* * Set up sysctl variables. */ xbb_setup_sysctl(xbb); /* * Create a taskqueue for doing work that must occur from a * thread context. */ xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), M_NOWAIT, taskqueue_thread_enqueue, /*context*/&xbb->io_taskqueue); if (xbb->io_taskqueue == NULL) { xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue"); return; } taskqueue_start_threads(&xbb->io_taskqueue, /*num threads*/1, /*priority*/PWAIT, /*thread name*/ "%s taskq", device_get_nameunit(dev)); /* Update hot-plug status to satisfy xend. */ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-status", "connected"); if (error) { xbb_attach_failed(xbb, error, "writing %s/hotplug-status", xenbus_get_node(xbb->dev)); return; } xbb->hotplug_done = true; /* The front end might be waiting for the backend, attach if so. */ if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) xbb_connect(xbb); } /** * Attach to a XenBus device that has been claimed by our probe routine. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_attach(device_t dev) { struct xbb_softc *xbb; int error; u_int max_ring_page_order; struct sbuf *watch_path; DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); /* * Basic initialization. * After this block it is safe to call xbb_detach() * to clean up any allocated data for this instance. */ xbb = device_get_softc(dev); xbb->dev = dev; xbb->otherend_id = xenbus_get_otherend_id(dev); TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); /* * Publish protocol capabilities for consumption by the * front-end.
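 *
 * One of the values published below is the largest ring the backend
 * accepts, expressed as a page order via flsl() (find last set bit),
 * which for a power of two computes floor(log2):
 *
 *	max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
 *
 * For illustration, if XBB_MAX_RING_PAGES were 32, flsl(32) == 6 and
 * the published order would be 5, inviting the frontend to request up
 * to 1 << 5 == 32 ring pages.
 *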
*/ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "feature-barrier", "1"); if (error) { xbb_attach_failed(xbb, error, "writing %s/feature-barrier", xenbus_get_node(xbb->dev)); return (error); } error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "feature-flush-cache", "1"); if (error) { xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", xenbus_get_node(xbb->dev)); return (error); } max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "max-ring-page-order", "%u", max_ring_page_order); if (error) { xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", xenbus_get_node(xbb->dev)); return (error); } /* * We need to wait for hotplug script execution before * moving forward. */ KASSERT(!xbb->hotplug_done, ("Hotplug scripts already executed")); watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); xbb->hotplug_watch.callback_data = (uintptr_t)dev; xbb->hotplug_watch.callback = xbb_attach_disk; KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); sbuf_delete(watch_path); error = xs_register_watch(&xbb->hotplug_watch); if (error != 0) { xbb_attach_failed(xbb, error, "failed to create watch on %s", xbb->hotplug_watch.node); free(xbb->hotplug_watch.node, M_XENBLOCKBACK); return (error); } /* Tell the toolstack blkback has attached. */ xenbus_set_state(dev, XenbusStateInitWait); return (0); } /** * Detach from a block back device instance. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. * * \note A block back device may be detached at any time in its life-cycle, * including part way through the attach process. For this reason, * initialization order and the initialization state checks in this * routine must be carefully coupled so that attach time failures * are gracefully handled. */ static int xbb_detach(device_t dev) { struct xbb_softc *xbb; DPRINTF("\n"); xbb = device_get_softc(dev); mtx_lock(&xbb->lock); while (xbb_shutdown(xbb) == EAGAIN) { msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, "xbb_shutdown", 0); } mtx_unlock(&xbb->lock); DPRINTF("\n"); if (xbb->io_taskqueue != NULL) taskqueue_free(xbb->io_taskqueue); if (xbb->xbb_stats != NULL) devstat_remove_entry(xbb->xbb_stats); if (xbb->xbb_stats_in != NULL) devstat_remove_entry(xbb->xbb_stats_in); xbb_close_backend(xbb); if (xbb->dev_mode != NULL) { free(xbb->dev_mode, M_XENSTORE); xbb->dev_mode = NULL; } if (xbb->dev_type != NULL) { free(xbb->dev_type, M_XENSTORE); xbb->dev_type = NULL; } if (xbb->dev_name != NULL) { free(xbb->dev_name, M_XENSTORE); xbb->dev_name = NULL; } mtx_destroy(&xbb->lock); return (0); } /** * Prepare this block back device for suspension of this VM. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_suspend(device_t dev) { #ifdef NOT_YET struct xbb_softc *sc = device_get_softc(dev); /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xb_io_lock); sc->connected = BLKIF_STATE_SUSPENDED; mtx_unlock(&sc->xb_io_lock); #endif return (0); } /** * Perform any processing required to recover from a suspended state. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. 
*/ static int xbb_resume(device_t dev) { return (0); } /** * Handle state changes expressed via the XenStore by our front-end peer. * * \param dev NewBus device object representing this Xen * Block Back instance. * \param frontend_state The new state of the front-end. * * \return 0 for success, errno codes for failure. */ static void xbb_frontend_changed(device_t dev, XenbusState frontend_state) { struct xbb_softc *xbb = device_get_softc(dev); DPRINTF("frontend_state=%s, xbb_state=%s\n", xenbus_strstate(frontend_state), xenbus_strstate(xenbus_get_state(xbb->dev))); switch (frontend_state) { case XenbusStateInitialising: break; case XenbusStateInitialised: case XenbusStateConnected: xbb_connect(xbb); break; case XenbusStateClosing: case XenbusStateClosed: mtx_lock(&xbb->lock); xbb_shutdown(xbb); mtx_unlock(&xbb->lock); if (frontend_state == XenbusStateClosed) xenbus_set_state(xbb->dev, XenbusStateClosed); break; default: xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", frontend_state); break; } } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xbb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xbb_probe), DEVMETHOD(device_attach, xbb_attach), DEVMETHOD(device_detach, xbb_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xbb_suspend), DEVMETHOD(device_resume, xbb_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), { 0, 0 } }; static driver_t xbb_driver = { "xbbd", xbb_methods, sizeof(struct xbb_softc), }; devclass_t xbb_devclass; DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); Index: head/sys/fs/cd9660/cd9660_vfsops.c =================================================================== --- head/sys/fs/cd9660/cd9660_vfsops.c (revision 364371) +++ head/sys/fs/cd9660/cd9660_vfsops.c (revision 364372) @@ -1,852 +1,852 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley * by Pace Willisson (pace@blitz.com). The Rock Ridge Extension * Support code is derived from software contributed to Berkeley * by Atsushi Murai (amurai@spec.co.jp). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cd9660_vfsops.c 8.18 (Berkeley) 5/22/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ISOFSMNT, "isofs_mount", "ISOFS mount structure"); MALLOC_DEFINE(M_ISOFSNODE, "isofs_node", "ISOFS vnode private part"); struct iconv_functions *cd9660_iconv = NULL; static vfs_mount_t cd9660_mount; static vfs_cmount_t cd9660_cmount; static vfs_unmount_t cd9660_unmount; static vfs_root_t cd9660_root; static vfs_statfs_t cd9660_statfs; static vfs_vget_t cd9660_vget; static vfs_fhtovp_t cd9660_fhtovp; static struct vfsops cd9660_vfsops = { .vfs_fhtovp = cd9660_fhtovp, .vfs_mount = cd9660_mount, .vfs_cmount = cd9660_cmount, .vfs_root = cd9660_root, .vfs_statfs = cd9660_statfs, .vfs_unmount = cd9660_unmount, .vfs_vget = cd9660_vget, }; VFS_SET(cd9660_vfsops, cd9660, VFCF_READONLY); MODULE_VERSION(cd9660, 1); static int cd9660_vfs_hash_cmp(struct vnode *vp, void *pino); static int iso_mountfs(struct vnode *devvp, struct mount *mp); /* * VFS Operations. */ static int cd9660_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct iso_args args; int error; error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); ma = mount_argsu(ma, "cs_disk", args.cs_disk, 64); ma = mount_argsu(ma, "cs_local", args.cs_local, 64); ma = mount_argf(ma, "ssector", "%u", args.ssector); ma = mount_argb(ma, !(args.flags & ISOFSMNT_NORRIP), "norrip"); ma = mount_argb(ma, args.flags & ISOFSMNT_GENS, "nogens"); ma = mount_argb(ma, args.flags & ISOFSMNT_EXTATT, "noextatt"); ma = mount_argb(ma, !(args.flags & ISOFSMNT_NOJOLIET), "nojoliet"); ma = mount_argb(ma, args.flags & ISOFSMNT_BROKENJOLIET, "nobrokenjoliet"); ma = mount_argb(ma, args.flags & ISOFSMNT_KICONV, "nokiconv"); error = kernel_mount(ma, flags); return (error); } static int cd9660_mount(struct mount *mp) { struct vnode *devvp; struct thread *td; char *fspec; int error; accmode_t accmode; struct nameidata ndp; struct iso_mnt *imp = NULL; td = curthread; /* * Unconditionally mount as read-only. */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); imp = VFSTOISOFS(mp); if (mp->mnt_flag & MNT_UPDATE) { if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) return (0); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. 
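 *
 * This revision switches the check below from vn_isdisk() to
 * vn_isdisk_error(), which keeps the errno out-parameter, so call
 * sites that need the reason for a failed check still receive it
 * (for a vnode that is not a disk, typically ENOTBLK).  The shape of
 * the call site is unchanged:
 *
 *	if (!vn_isdisk_error(devvp, &error)) {
 *		vput(devvp);
 *		return (error);
 *	}
 *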
*/ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp))) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; - if (!vn_isdisk(devvp, &error)) { + if (!vn_isdisk_error(devvp, &error)) { vput(devvp); return (error); } /* * Verify that user has necessary permissions on the device, * or has superuser abilities */ accmode = VREAD; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = iso_mountfs(devvp, mp); if (error) vrele(devvp); } else { if (devvp != imp->im_devvp) error = EINVAL; /* needs translation */ vput(devvp); } if (error) return (error); vfs_mountedfrom(mp, fspec); return (0); } /* * Common code for mount and mountroot */ static int iso_mountfs(devvp, mp) struct vnode *devvp; struct mount *mp; { struct iso_mnt *isomp = NULL; struct buf *bp = NULL; struct buf *pribp = NULL, *supbp = NULL; struct cdev *dev; int error = EINVAL; int high_sierra = 0; int iso_bsize; int iso_blknum; int joliet_level; int isverified = 0; struct iso_volume_descriptor *vdp = NULL; struct iso_primary_descriptor *pri = NULL; struct iso_sierra_primary_descriptor *pri_sierra = NULL; struct iso_supplementary_descriptor *sup = NULL; struct iso_directory_record *rootp; int logical_block_size, ssector; struct g_consumer *cp; struct bufobj *bo; char *cs_local, *cs_disk; dev = devvp->v_rdev; dev_ref(dev); g_topology_lock(); error = g_vfs_open(devvp, &cp, "cd9660", 0); if (error == 0) g_getattr("MNT::verified", cp, &isverified); g_topology_unlock(); VOP_UNLOCK(devvp); if (error) goto out; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bo = &devvp->v_bufobj; /* This is the "logical sector size". The standard says this * should be 2048 or the physical sector size on the device, * whichever is greater. */ if ((ISO_DEFAULT_BLOCK_SIZE % cp->provider->sectorsize) != 0) { error = EINVAL; goto out; } iso_bsize = cp->provider->sectorsize; joliet_level = 0; if (1 != vfs_scanopt(mp->mnt_optnew, "ssector", "%d", &ssector)) ssector = 0; for (iso_blknum = 16 + ssector; iso_blknum < 100 + ssector; iso_blknum++) { if ((error = bread(devvp, iso_blknum * btodb(ISO_DEFAULT_BLOCK_SIZE), iso_bsize, NOCRED, &bp)) != 0) goto out; vdp = (struct iso_volume_descriptor *)bp->b_data; if (bcmp (vdp->id, ISO_STANDARD_ID, sizeof vdp->id) != 0) { if (bcmp (vdp->id_sierra, ISO_SIERRA_ID, sizeof vdp->id_sierra) != 0) { error = EINVAL; goto out; } else high_sierra = 1; } switch (isonum_711 (high_sierra? 
vdp->type_sierra: vdp->type)){ case ISO_VD_PRIMARY: if (pribp == NULL) { pribp = bp; bp = NULL; pri = (struct iso_primary_descriptor *)vdp; pri_sierra = (struct iso_sierra_primary_descriptor *)vdp; } break; case ISO_VD_SUPPLEMENTARY: if (supbp == NULL) { supbp = bp; bp = NULL; sup = (struct iso_supplementary_descriptor *)vdp; if (!vfs_flagopt(mp->mnt_optnew, "nojoliet", NULL, 0)) { if (bcmp(sup->escape, "%/@", 3) == 0) joliet_level = 1; if (bcmp(sup->escape, "%/C", 3) == 0) joliet_level = 2; if (bcmp(sup->escape, "%/E", 3) == 0) joliet_level = 3; if ((isonum_711 (sup->flags) & 1) && !vfs_flagopt(mp->mnt_optnew, "brokenjoliet", NULL, 0)) joliet_level = 0; } } break; case ISO_VD_END: goto vd_end; default: break; } if (bp != NULL) { brelse(bp); bp = NULL; } } vd_end: if (bp != NULL) { brelse(bp); bp = NULL; } if (pri == NULL) { error = EINVAL; goto out; } logical_block_size = isonum_723 (high_sierra? pri_sierra->logical_block_size: pri->logical_block_size); if (logical_block_size < DEV_BSIZE || logical_block_size > MAXBSIZE || (logical_block_size & (logical_block_size - 1)) != 0) { error = EINVAL; goto out; } rootp = (struct iso_directory_record *) (high_sierra? pri_sierra->root_directory_record: pri->root_directory_record); isomp = malloc(sizeof *isomp, M_ISOFSMNT, M_WAITOK | M_ZERO); isomp->im_cp = cp; isomp->im_bo = bo; isomp->logical_block_size = logical_block_size; isomp->volume_space_size = isonum_733 (high_sierra? pri_sierra->volume_space_size: pri->volume_space_size); isomp->joliet_level = 0; /* * Since an ISO9660 multi-session CD can also access previous * sessions, we have to include them into the space consider- * ations. This doesn't yield a very accurate number since * parts of the old sessions might be inaccessible now, but we * can't do much better. This is also important for the NFS * filehandle validation. 
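 *
 * Worked example with hypothetical numbers: if a second session starts
 * at sector 11400 (ssector == 11400) and its primary volume descriptor
 * advertises 200000 blocks, the mount accounts for
 * 200000 + 11400 == 211400 blocks, so the absolute extents of earlier
 * sessions still fall inside the bounds that cd9660_vget_internal()
 * checks when validating NFS filehandles.
 *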
*/ isomp->volume_space_size += ssector; memcpy(isomp->root, rootp, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->im_bmask = logical_block_size - 1; isomp->im_bshift = ffs(logical_block_size) - 1; pribp->b_flags |= B_AGE; brelse(pribp); pribp = NULL; rootp = NULL; pri = NULL; pri_sierra = NULL; mp->mnt_data = isomp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = 0; MNT_ILOCK(mp); if (isverified) mp->mnt_flag |= MNT_VERIFIED; mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED; MNT_IUNLOCK(mp); isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; vfs_flagopt(mp->mnt_optnew, "norrip", &isomp->im_flags, ISOFSMNT_NORRIP); vfs_flagopt(mp->mnt_optnew, "gens", &isomp->im_flags, ISOFSMNT_GENS); vfs_flagopt(mp->mnt_optnew, "extatt", &isomp->im_flags, ISOFSMNT_EXTATT); vfs_flagopt(mp->mnt_optnew, "nojoliet", &isomp->im_flags, ISOFSMNT_NOJOLIET); vfs_flagopt(mp->mnt_optnew, "kiconv", &isomp->im_flags, ISOFSMNT_KICONV); /* Check the Rock Ridge Extension support */ if (!(isomp->im_flags & ISOFSMNT_NORRIP)) { if ((error = bread(isomp->im_devvp, (isomp->root_extent + isonum_711(((struct iso_directory_record *)isomp->root)-> ext_attr_length)) << (isomp->im_bshift - DEV_BSHIFT), isomp->logical_block_size, NOCRED, &bp)) != 0) goto out; rootp = (struct iso_directory_record *)bp->b_data; if ((isomp->rr_skip = cd9660_rrip_offset(rootp,isomp)) < 0) { isomp->im_flags |= ISOFSMNT_NORRIP; } else { isomp->im_flags &= ~ISOFSMNT_GENS; } /* * The contents are valid, * but they will get reread as part of another vnode, so... */ bp->b_flags |= B_AGE; brelse(bp); bp = NULL; rootp = NULL; } if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) { cs_local = vfs_getopts(mp->mnt_optnew, "cs_local", &error); if (error) goto out; cs_disk = vfs_getopts(mp->mnt_optnew, "cs_disk", &error); if (error) goto out; cd9660_iconv->open(cs_local, cs_disk, &isomp->im_d2l); cd9660_iconv->open(cs_disk, cs_local, &isomp->im_l2d); } else { isomp->im_d2l = NULL; isomp->im_l2d = NULL; } if (high_sierra) { /* this effectively ignores all the mount flags */ if (bootverbose) log(LOG_INFO, "cd9660: High Sierra Format\n"); isomp->iso_ftype = ISO_FTYPE_HIGH_SIERRA; } else switch (isomp->im_flags&(ISOFSMNT_NORRIP|ISOFSMNT_GENS)) { default: isomp->iso_ftype = ISO_FTYPE_DEFAULT; break; case ISOFSMNT_GENS|ISOFSMNT_NORRIP: isomp->iso_ftype = ISO_FTYPE_9660; break; case 0: if (bootverbose) log(LOG_INFO, "cd9660: RockRidge Extension\n"); isomp->iso_ftype = ISO_FTYPE_RRIP; break; } /* Decide whether to use the Joliet descriptor */ if (isomp->iso_ftype != ISO_FTYPE_RRIP && joliet_level) { if (bootverbose) log(LOG_INFO, "cd9660: Joliet Extension (Level %d)\n", joliet_level); rootp = (struct iso_directory_record *) sup->root_directory_record; memcpy(isomp->root, rootp, sizeof isomp->root); isomp->root_extent = isonum_733 (rootp->extent); isomp->root_size = isonum_733 (rootp->size); isomp->joliet_level = joliet_level; supbp->b_flags |= B_AGE; } if (supbp) { brelse(supbp); supbp = NULL; sup = NULL; } return 0; out: if (bp != NULL) brelse(bp); if (pribp != NULL) brelse(pribp); if (supbp != NULL) brelse(supbp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (isomp) { free(isomp, M_ISOFSMNT); mp->mnt_data = NULL; } dev_rel(dev); return error; } /* * unmount system call */ static int cd9660_unmount(mp, mntflags) struct mount *mp; 
int mntflags; { struct iso_mnt *isomp; int error, flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = vflush(mp, 0, flags, curthread))) return (error); isomp = VFSTOISOFS(mp); if (isomp->im_flags & ISOFSMNT_KICONV && cd9660_iconv) { if (isomp->im_d2l) cd9660_iconv->close(isomp->im_d2l); if (isomp->im_l2d) cd9660_iconv->close(isomp->im_l2d); } g_topology_lock(); g_vfs_close(isomp->im_cp); g_topology_unlock(); vrele(isomp->im_devvp); dev_rel(isomp->im_dev); free(isomp, M_ISOFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Return root of a filesystem */ static int cd9660_root(mp, flags, vpp) struct mount *mp; int flags; struct vnode **vpp; { struct iso_mnt *imp = VFSTOISOFS(mp); struct iso_directory_record *dp = (struct iso_directory_record *)imp->root; cd_ino_t ino = isodirino(dp, imp); /* * With RRIP we must use the `.' entry of the root directory. * Simply tell vget, that it's a relocated directory. */ return (cd9660_vget_internal(mp, ino, flags, vpp, imp->iso_ftype == ISO_FTYPE_RRIP, dp)); } /* * Get filesystem statistics. */ static int cd9660_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct iso_mnt *isomp; isomp = VFSTOISOFS(mp); sbp->f_bsize = isomp->logical_block_size; sbp->f_iosize = sbp->f_bsize; /* XXX */ sbp->f_blocks = isomp->volume_space_size; sbp->f_bfree = 0; /* total free blocks */ sbp->f_bavail = 0; /* blocks free for non superuser */ sbp->f_files = 0; /* total files */ sbp->f_ffree = 0; /* free file nodes */ return 0; } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is in range * - call iget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the generation number matches */ /* ARGSUSED */ static int cd9660_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ifid ifh; struct iso_node *ip; struct vnode *nvp; int error; memcpy(&ifh, fhp, sizeof(ifh)); #ifdef ISOFS_DBG printf("fhtovp: ino %d, start %ld\n", ifh.ifid_ino, ifh.ifid_start); #endif if ((error = VFS_VGET(mp, ifh.ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->inode.iso_mode == 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, ip->i_size, curthread); return (0); } /* * Conform to standard VFS interface; can't vget arbitrary inodes beyond 4GB * into media with current inode scheme and 32-bit ino_t. This shouldn't be * needed for anything other than nfsd, and who exports a mounted DVD over NFS? */ static int cd9660_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { /* * XXXX * It would be nice if we didn't always set the `relocated' flag * and force the extra read, but I don't want to think about fixing * that right now. */ return (cd9660_vget_internal(mp, ino, flags, vpp, #if 0 VFSTOISOFS(mp)->iso_ftype == ISO_FTYPE_RRIP, #else 0, #endif (struct iso_directory_record *)0)); } /* Use special comparator for full 64-bit ino comparison. 
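 *
 * The vfs_hash key is a u_int, so distinct 64-bit cd9660 inode numbers
 * can land in the same bucket; this comparator (0 means "match")
 * disambiguates them by comparing the full cd_ino_t.  The lookup it
 * serves, as issued from cd9660_vget_internal() below:
 *
 *	error = vfs_hash_get(mp, ino, flags, td, vpp,
 *	    cd9660_vfs_hash_cmp, &ino);
 *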
*/ static int cd9660_vfs_hash_cmp(vp, pino) struct vnode *vp; void *pino; { struct iso_node *ip; cd_ino_t ino; ip = VTOI(vp); ino = *(cd_ino_t *)pino; return (ip->i_number != ino); } int cd9660_vget_internal(mp, ino, flags, vpp, relocated, isodir) struct mount *mp; cd_ino_t ino; int flags; struct vnode **vpp; int relocated; struct iso_directory_record *isodir; { struct iso_mnt *imp; struct iso_node *ip; struct buf *bp; struct vnode *vp; int error; struct thread *td; td = curthread; error = vfs_hash_get(mp, ino, flags, td, vpp, cd9660_vfs_hash_cmp, &ino); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ imp = VFSTOISOFS(mp); /* Allocate a new vnode/iso_node. */ if ((error = getnewvnode("isofs", mp, &cd9660_vnodeops, &vp)) != 0) { *vpp = NULLVP; return (error); } ip = malloc(sizeof(struct iso_node), M_ISOFSNODE, M_WAITOK | M_ZERO); vp->v_data = ip; ip->i_vnode = vp; ip->i_number = ino; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); error = insmntque(vp, mp); if (error != 0) { free(ip, M_ISOFSNODE); *vpp = NULLVP; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, cd9660_vfs_hash_cmp, &ino); if (error || *vpp != NULL) return (error); if (isodir == NULL) { int lbn, off; lbn = lblkno(imp, ino); if (lbn >= imp->volume_space_size) { vput(vp); printf("fhtovp: lbn exceed volume space %d\n", lbn); return (ESTALE); } off = blkoff(imp, ino); if (off + ISO_DIRECTORY_RECORD_SIZE > imp->logical_block_size) { vput(vp); printf("fhtovp: crosses block boundary %d\n", off + ISO_DIRECTORY_RECORD_SIZE); return (ESTALE); } error = bread(imp->im_devvp, lbn << (imp->im_bshift - DEV_BSHIFT), imp->logical_block_size, NOCRED, &bp); if (error) { vput(vp); printf("fhtovp: bread error %d\n",error); return (error); } isodir = (struct iso_directory_record *)(bp->b_data + off); if (off + isonum_711(isodir->length) > imp->logical_block_size) { vput(vp); brelse(bp); printf("fhtovp: directory crosses block boundary %d[off=%d/len=%d]\n", off +isonum_711(isodir->length), off, isonum_711(isodir->length)); return (ESTALE); } #if 0 if (isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length) != ifhp->ifid_start) { brelse(bp); printf("fhtovp: file start miss %d vs %d\n", isonum_733(isodir->extent) + isonum_711(isodir->ext_attr_length), ifhp->ifid_start); return (ESTALE); } #endif } else bp = NULL; ip->i_mnt = imp; if (relocated) { /* * On relocated directories we must * read the `.' entry out of a dir. 
*/ ip->iso_start = ino >> imp->im_bshift; if (bp != NULL) brelse(bp); if ((error = cd9660_blkatoff(vp, (off_t)0, NULL, &bp)) != 0) { vput(vp); return (error); } isodir = (struct iso_directory_record *)bp->b_data; } ip->iso_extent = isonum_733(isodir->extent); ip->i_size = isonum_733(isodir->size); ip->iso_start = isonum_711(isodir->ext_attr_length) + ip->iso_extent; /* * Setup time stamp, attribute */ vp->v_type = VNON; switch (imp->iso_ftype) { default: /* ISO_FTYPE_9660 */ { struct buf *bp2; int off; if ((imp->im_flags & ISOFSMNT_EXTATT) && (off = isonum_711(isodir->ext_attr_length))) cd9660_blkatoff(vp, (off_t)-(off << imp->im_bshift), NULL, &bp2); else bp2 = NULL; cd9660_defattr(isodir, ip, bp2, ISO_FTYPE_9660); cd9660_deftstamp(isodir, ip, bp2, ISO_FTYPE_9660); if (bp2) brelse(bp2); break; } case ISO_FTYPE_RRIP: cd9660_rrip_analyze(isodir, ip, imp); break; } brelse(bp); /* * Initialize the associated vnode */ switch (vp->v_type = IFTOVT(ip->inode.iso_mode)) { case VFIFO: vp->v_op = &cd9660_fifoops; break; default: VN_LOCK_ASHARE(vp); break; } if (ip->iso_extent == imp->root_extent) vp->v_vflag |= VV_ROOT; /* * XXX need generation number? */ *vpp = vp; return (0); } Index: head/sys/fs/ext2fs/ext2_vfsops.c =================================================================== --- head/sys/fs/ext2fs/ext2_vfsops.c (revision 364371) +++ head/sys/fs/ext2fs/ext2_vfsops.c (revision 364372) @@ -1,1445 +1,1445 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DECLARE(ext2fs); /* * ext2fs trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(ext2fs, , vfsops, trace, "int", "char*"); SDT_PROBE_DEFINE2(ext2fs, , vfsops, ext2_cg_validate_error, "char*", "int"); SDT_PROBE_DEFINE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "char*"); static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td); static int ext2_mountfs(struct vnode *, struct mount *); static int ext2_reload(struct mount *mp, struct thread *td); static int ext2_sbupdate(struct ext2mount *, int); static int ext2_cgupdate(struct ext2mount *, int); static vfs_unmount_t ext2_unmount; static vfs_root_t ext2_root; static vfs_statfs_t ext2_statfs; static vfs_sync_t ext2_sync; static vfs_vget_t ext2_vget; static vfs_fhtovp_t ext2_fhtovp; static vfs_mount_t ext2_mount; MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part"); static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure"); static struct vfsops ext2fs_vfsops = { .vfs_fhtovp = ext2_fhtovp, .vfs_mount = ext2_mount, .vfs_root = ext2_root, /* root inode via vget */ .vfs_statfs = ext2_statfs, .vfs_sync = ext2_sync, .vfs_unmount = ext2_unmount, .vfs_vget = ext2_vget, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly); static int ext2_compute_sb_data(struct vnode * devvp, struct ext2fs * es, struct m_ext2fs * fs); static const char *ext2_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "multilabel", "suiddir", "nosymfollow", "sync", "union", NULL }; /* * VFS Operations. * * mount system call */ static int ext2_mount(struct mount *mp) { struct vfsoptlist *opts; struct vnode *devvp; struct thread *td; struct ext2mount *ump = NULL; struct m_ext2fs *fs; struct nameidata nd, *ndp = &nd; accmode_t accmode; char *path, *fspec; int error, flags, len; td = curthread; opts = mp->mnt_optnew; if (vfs_filteropt(opts, ext2_opts)) return (EINVAL); vfs_getopt(opts, "fspath", (void **)&path, NULL); /* Double-check the length of path.. */ if (strlen(path) >= MAXMNTLEN) return (ENAMETOOLONG); fspec = NULL; error = vfs_getopt(opts, "from", (void **)&fspec, &len); if (!error && fspec[len - 1] != '\0') return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOEXT2(mp); fs = ump->um_e2fs; error = 0; if (fs->e2fs_ronly == 0 && vfs_flagopt(opts, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = ext2_flushfiles(mp, flags, td); if (error == 0 && fs->e2fs_wasvalid && ext2_cgupdate(ump, MNT_WAIT) == 0) { fs->e2fs->e2fs_state = htole16((le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN)); ext2_sbupdate(ump, MNT_WAIT); } fs->e2fs_ronly = 1; vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, td); if (error) return (error); devvp = ump->um_devvp; if (fs->e2fs_ronly && !vfs_flagopt(opts, "ro", NULL, 0)) { if (ext2_check_sb_compat(fs->e2fs, devvp->v_rdev, 0)) return (EPERM); /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. 
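 *
 * The g_access() calls in this path take deltas for the GEOM
 * consumer's (read, write, exclusive) access counts: the upgrade below
 * requests one additional writer with g_access(ump->um_cp, 0, 1, 0),
 * mirroring the g_access(ump->um_cp, 0, -1, 0) that drops the writer
 * on the read-only downgrade earlier in this function.
 *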
*/ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp); return (error); } VOP_UNLOCK(devvp); g_topology_lock(); error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) return (error); if ((le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN) == 0 || (le16toh(fs->e2fs->e2fs_state) & E2FS_ERRORS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->e2fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->e2fs_fsmnt); return (EPERM); } } fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN); (void)ext2_cgupdate(ump, MNT_WAIT); fs->e2fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } if (vfs_flagopt(opts, "export", NULL, 0)) { /* Process export requests in vfs_mount.c. */ return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (fspec == NULL) return (EINVAL); NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; - if (!vn_isdisk(devvp, &error)) { + if (!vn_isdisk_error(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. * * XXXRW: VOP_ACCESS() enough? */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = ext2_mountfs(devvp, mp); } else { if (devvp != ump->um_devvp) { vput(devvp); return (EINVAL); /* needs translation */ } else vput(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOEXT2(mp); fs = ump->um_e2fs; /* * Note that this strncpy() is ok because of a check at the start * of ext2_mount(). */ strncpy(fs->e2fs_fsmnt, path, MAXMNTLEN); fs->e2fs_fsmnt[MAXMNTLEN - 1] = '\0'; vfs_mountedfrom(mp, fspec); return (0); } static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly) { uint32_t i, mask; if (le16toh(es->e2fs_magic) != E2FS_MAGIC) { printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n", devtoname(dev), le16toh(es->e2fs_magic), E2FS_MAGIC); return (1); } if (le32toh(es->e2fs_rev) > E2FS_REV0) { mask = le32toh(es->e2fs_features_incompat) & ~(EXT2F_INCOMPAT_SUPP); if (mask) { printf("WARNING: mount of %s denied due to " "unsupported optional features:\n", devtoname(dev)); for (i = 0; i < sizeof(incompat)/sizeof(struct ext2_feature); i++) if (mask & incompat[i].mask) printf("%s ", incompat[i].name); printf("\n"); return (1); } mask = le32toh(es->e2fs_features_rocompat) & ~EXT2F_ROCOMPAT_SUPP; if (!ronly && mask) { printf("WARNING: R/W mount of %s denied due to " "unsupported optional features:\n", devtoname(dev)); for (i = 0; i < sizeof(ro_compat)/sizeof(struct ext2_feature); i++) if (mask & ro_compat[i].mask) printf("%s ", ro_compat[i].name); printf("\n"); return (1); } } return (0); } static e4fs_daddr_t ext2_cg_location(struct m_ext2fs *fs, int number) { int cg, descpb, logical_sb, has_super = 0; /* * Adjust logical superblock block number. * Godmar thinks: if the blocksize is greater than 1024, then * the superblock is logically part of block zero. 
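 *
 * Worked example, assuming the usual 1 KiB SBSIZE: with 1 KiB blocks
 * the superblock occupies filesystem block 1, so descriptor block n
 * (outside META_BG) lives at 1 + n + 1; with 4 KiB blocks the
 * superblock sits inside block 0 and the same descriptor block lives
 * at 0 + n + 1.  For n == 0 that is block 2 and block 1 respectively,
 * matching the classic ext2 disk layouts.
 *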
*/ logical_sb = fs->e2fs_bsize > SBSIZE ? 0 : 1; if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_META_BG) || number < le32toh(fs->e2fs->e3fs_first_meta_bg)) return (logical_sb + number + 1); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) descpb = fs->e2fs_bsize / sizeof(struct ext2_gd); else descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE; cg = descpb * number; if (ext2_cg_has_sb(fs, cg)) has_super = 1; return (has_super + cg * (e4fs_daddr_t)EXT2_BLOCKS_PER_GROUP(fs) + le32toh(fs->e2fs->e2fs_first_dblock)); } static int ext2_cg_validate(struct m_ext2fs *fs) { uint64_t b_bitmap; uint64_t i_bitmap; uint64_t i_tables; uint64_t first_block, last_block, last_cg_block; struct ext2_gd *gd; unsigned int i, cg_count; first_block = le32toh(fs->e2fs->e2fs_first_dblock); last_cg_block = ext2_cg_number_gdb(fs, 0); cg_count = fs->e2fs_gcount; for (i = 0; i < fs->e2fs_gcount; i++) { gd = &fs->e2fs_gd[i]; if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG) || i == fs->e2fs_gcount - 1) { last_block = fs->e2fs_bcount - 1; } else { last_block = first_block + (EXT2_BLOCKS_PER_GROUP(fs) - 1); } if ((cg_count == fs->e2fs_gcount) && !(le16toh(gd->ext4bgd_flags) & EXT2_BG_INODE_ZEROED)) cg_count = i; b_bitmap = e2fs_gd_get_b_bitmap(gd); if (b_bitmap == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap is zero", i); return (EINVAL); } if (b_bitmap <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap overlaps gds", i); return (EINVAL); } if (b_bitmap < first_block || b_bitmap > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap not in group", i); return (EINVAL); } i_bitmap = e2fs_gd_get_i_bitmap(gd); if (i_bitmap == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap is zero", i); return (EINVAL); } if (i_bitmap <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap overlaps gds", i); return (EINVAL); } if (i_bitmap < first_block || i_bitmap > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap not in group blk", i); return (EINVAL); } i_tables = e2fs_gd_get_i_tables(gd); if (i_tables == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode table is zero", i); return (EINVAL); } if (i_tables <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode tables overlap gds", i); return (EINVAL); } if (i_tables < first_block || i_tables + fs->e2fs_itpg - 1 > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode tables not in group blk", i); return (EINVAL); } if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG)) first_block += EXT2_BLOCKS_PER_GROUP(fs); } return (0); } /* * This computes the fields of the m_ext2fs structure from the * data in the ext2fs structure read in.
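 *
 * Worked example for the size computations that follow, assuming
 * EXT2_MIN_BLOCK_LOG_SIZE == 10 and 512-byte disk blocks: with
 * e2fs_log_bsize == 2, e2fs_bshift = 10 + 2 = 12, so
 * e2fs_bsize = 1 << 12 = 4096, and e2fs_fsbtodb = 2 + 1 = 3, i.e. one
 * filesystem block spans 1 << 3 = 8 DEV_BSIZE sectors (4096 / 512).
 *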
*/ static int ext2_compute_sb_data(struct vnode *devvp, struct ext2fs *es, struct m_ext2fs *fs) { struct buf *bp; uint32_t e2fs_descpb, e2fs_gdbcount_alloc; int i, j; int g_count = 0; int error; /* Check checksum features */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) && EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "incorrect checksum features combination"); return (EINVAL); } /* Precompute checksum seed for all metadata */ ext2_sb_csum_set_seed(fs); /* Verify sb csum if possible */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { error = ext2_sb_csum_verify(fs); if (error) { return (error); } } /* Check for block size = 1K|2K|4K */ if (le32toh(es->e2fs_log_bsize) > 2) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "bad block size"); return (EINVAL); } fs->e2fs_bshift = EXT2_MIN_BLOCK_LOG_SIZE + le32toh(es->e2fs_log_bsize); fs->e2fs_bsize = 1U << fs->e2fs_bshift; fs->e2fs_fsbtodb = le32toh(es->e2fs_log_bsize) + 1; fs->e2fs_qbmask = fs->e2fs_bsize - 1; /* Check for fragment size */ if (le32toh(es->e2fs_log_fsize) > (EXT2_MAX_FRAG_LOG_SIZE - EXT2_MIN_BLOCK_LOG_SIZE)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid log cluster size"); return (EINVAL); } fs->e2fs_fsize = EXT2_MIN_FRAG_SIZE << le32toh(es->e2fs_log_fsize); if (fs->e2fs_fsize != fs->e2fs_bsize) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "fragment size != block size"); return (EINVAL); } fs->e2fs_fpb = fs->e2fs_bsize / fs->e2fs_fsize; /* Check reserved gdt blocks for future filesystem expansion */ if (le16toh(es->e2fs_reserved_ngdb) > (fs->e2fs_bsize / 4)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "number of reserved GDT blocks too large"); return (EINVAL); } if (le32toh(es->e2fs_rev) == E2FS_REV0) { fs->e2fs_isize = E2FS_REV0_INODE_SIZE; } else { fs->e2fs_isize = le16toh(es->e2fs_inode_size); /* * Check first ino. */ if (le32toh(es->e2fs_first_ino) < EXT2_FIRSTINO) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid first ino"); return (EINVAL); } /* * Simple sanity check for superblock inode size value. 
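 *
 * The test below uses the standard power-of-two identity: for x > 0,
 * x is a power of two iff (x & (x - 1)) == 0.  A hypothetical helper
 * spelling it out:
 *
 *	static inline bool
 *	example_ispow2(uint32_t x)
 *	{
 *		return (x != 0 && (x & (x - 1)) == 0);
 *	}
 *
 * For example, 256 & 255 == 0 passes, while an isize of 192 gives
 * 192 & 191 == 128 and is rejected even though it lies between
 * E2FS_REV0_INODE_SIZE and the block size.
 *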
*/ if (EXT2_INODE_SIZE(fs) < E2FS_REV0_INODE_SIZE || EXT2_INODE_SIZE(fs) > fs->e2fs_bsize || (fs->e2fs_isize & (fs->e2fs_isize - 1)) != 0) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid inode size"); return (EINVAL); } } /* Check group descriptors */ if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT) && le16toh(es->e3fs_desc_size) != E2FS_64BIT_GD_SIZE) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "unsupported 64bit descriptor size"); return (EINVAL); } fs->e2fs_bpg = le32toh(es->e2fs_bpg); fs->e2fs_fpg = le32toh(es->e2fs_fpg); if (fs->e2fs_bpg == 0 || fs->e2fs_fpg == 0) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "zero blocks/fragments per group"); return (EINVAL); } else if (fs->e2fs_bpg != fs->e2fs_fpg) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "blocks per group not equal fragments per group"); return (EINVAL); } if (fs->e2fs_bpg != fs->e2fs_bsize * 8) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "non-standard group size unsupported"); return (EINVAL); } fs->e2fs_ipb = fs->e2fs_bsize / EXT2_INODE_SIZE(fs); if (fs->e2fs_ipb == 0 || fs->e2fs_ipb > fs->e2fs_bsize / E2FS_REV0_INODE_SIZE) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "bad inodes per block size"); return (EINVAL); } fs->e2fs_ipg = le32toh(es->e2fs_ipg); if (fs->e2fs_ipg < fs->e2fs_ipb || fs->e2fs_ipg > fs->e2fs_bsize * 8) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid inodes per group"); return (EINVAL); } fs->e2fs_itpg = fs->e2fs_ipg / fs->e2fs_ipb; fs->e2fs_bcount = le32toh(es->e2fs_bcount); fs->e2fs_rbcount = le32toh(es->e2fs_rbcount); fs->e2fs_fbcount = le32toh(es->e2fs_fbcount); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { fs->e2fs_bcount |= (uint64_t)(le32toh(es->e4fs_bcount_hi)) << 32; fs->e2fs_rbcount |= (uint64_t)(le32toh(es->e4fs_rbcount_hi)) << 32; fs->e2fs_fbcount |= (uint64_t)(le32toh(es->e4fs_fbcount_hi)) << 32; } if (fs->e2fs_rbcount > fs->e2fs_bcount || fs->e2fs_fbcount > fs->e2fs_bcount) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid block count"); return (EINVAL); } fs->e2fs_ficount = le32toh(es->e2fs_ficount); if (fs->e2fs_ficount > le32toh(es->e2fs_icount)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid number of free inodes"); return (EINVAL); } if (le32toh(es->e2fs_first_dblock) >= fs->e2fs_bcount) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "first data block out of range"); return (EINVAL); } fs->e2fs_gcount = howmany(fs->e2fs_bcount - le32toh(es->e2fs_first_dblock), EXT2_BLOCKS_PER_GROUP(fs)); if (fs->e2fs_gcount > ((uint64_t)1 << 32) - EXT2_DESCS_PER_BLOCK(fs)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "groups count too large"); return (EINVAL); } /* Check for extra isize in big inodes. */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) && EXT2_INODE_SIZE(fs) < sizeof(struct ext2fs_dinode)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "no space for extra inode timestamps"); return (EINVAL); } /* s_resuid / s_resgid ? 
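 *
 * The group accounting just computed is plain ceiling division via
 * howmany().  Worked example, assuming 4 KiB blocks (so 32768 blocks
 * per group) and the 32-byte revision-0 group descriptor (128 per
 * block): a 1,000,000-block filesystem with first_dblock == 0 has
 * e2fs_gcount = howmany(1000000, 32768) = 31 groups, and the code
 * below then reads e2fs_gdbcount = howmany(31, 128) = 1 descriptor
 * block.
 *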
*/ if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { e2fs_descpb = fs->e2fs_bsize / E2FS_64BIT_GD_SIZE; e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, e2fs_descpb); } else { e2fs_descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE; e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, fs->e2fs_bsize / sizeof(struct ext2_gd)); } fs->e2fs_gdbcount = howmany(fs->e2fs_gcount, e2fs_descpb); fs->e2fs_gd = malloc(e2fs_gdbcount_alloc * fs->e2fs_bsize, M_EXT2MNT, M_WAITOK | M_ZERO); fs->e2fs_contigdirs = malloc(fs->e2fs_gcount * sizeof(*fs->e2fs_contigdirs), M_EXT2MNT, M_WAITOK | M_ZERO); for (i = 0; i < fs->e2fs_gdbcount; i++) { error = bread(devvp, fsbtodb(fs, ext2_cg_location(fs, i)), fs->e2fs_bsize, NOCRED, &bp); if (error) { /* * fs->e2fs_gd and fs->e2fs_contigdirs * will be freed later by the caller, * because this function could be called from * the MNT_UPDATE path. */ return (error); } if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { memcpy(&fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], bp->b_data, fs->e2fs_bsize); } else { for (j = 0; j < e2fs_descpb && g_count < fs->e2fs_gcount; j++, g_count++) memcpy(&fs->e2fs_gd[g_count], bp->b_data + j * E2FS_REV0_GD_SIZE, E2FS_REV0_GD_SIZE); } brelse(bp); bp = NULL; } /* Validate cgs consistency */ error = ext2_cg_validate(fs); if (error) return (error); /* Verify cgs csum */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) || EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { error = ext2_gd_csum_verify(fs, devvp->v_rdev); if (error) return (error); } /* Initialization for the ext2 Orlov allocator variant. */ fs->e2fs_total_dir = 0; for (i = 0; i < fs->e2fs_gcount; i++) fs->e2fs_total_dir += e2fs_gd_get_ndirs(&fs->e2fs_gd[i]); if (le32toh(es->e2fs_rev) == E2FS_REV0 || !EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_LARGEFILE)) fs->e2fs_maxfilesize = 0x7fffffff; else { fs->e2fs_maxfilesize = 0xffffffffffff; if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_HUGE_FILE)) fs->e2fs_maxfilesize = 0x7fffffffffffffff; } if (le32toh(es->e4fs_flags) & E2FS_UNSIGNED_HASH) { fs->e2fs_uhash = 3; } else if ((le32toh(es->e4fs_flags) & E2FS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_UNSIGNED_HASH); fs->e2fs_uhash = 3; #else es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_SIGNED_HASH); #endif } if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) error = ext2_sb_csum_verify(fs); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) invalidate all cluster summary information. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. * XXX we are missing some steps, in particular # 3, this has to be reviewed. */ static int ext2_reload(struct mount *mp, struct thread *td) { struct vnode *vp, *mvp, *devvp; struct inode *ip; struct buf *bp; struct ext2fs *es; struct m_ext2fs *fs; struct csum *sump; int error, i; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOEXT2(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ext2_reload: dirty1"); VOP_UNLOCK(devvp); /* * Step 2: re-read superblock from disk.
* constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2fs *)bp->b_data; if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOEXT2(mp)->um_e2fs; bcopy(bp->b_data, fs->e2fs, sizeof(struct ext2fs)); if ((error = ext2_compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return (error); } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); /* * Step 3: invalidate all cluster summary information. */ if (fs->e2fs_contigsumsize > 0) { lp = fs->e2fs_maxcluster; sump = fs->e2fs_clustersum; for (i = 0; i < fs->e2fs_gcount; i++, sump++) { *lp++ = fs->e2fs_contigsumsize; sump->cs_init = 0; bzero(sump->cs_sum, fs->e2fs_contigsumsize + 1); } } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp); vrele(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)), ip); brelse(bp); VOP_UNLOCK(vp); vrele(vp); if (error) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } } return (0); } /* * Common code for mount and mountroot. */ static int ext2_mountfs(struct vnode *devvp, struct mount *mp) { struct ext2mount *ump; struct buf *bp; struct m_ext2fs *fs; struct ext2fs *es; struct cdev *dev = devvp->v_rdev; struct g_consumer *cp; struct bufobj *bo; struct csum *sump; int error; int ronly; int i; u_long size; int32_t *lp; int32_t e2fs_maxcontig; ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0); /* XXX: use VOP_ACESS to check FS perms */ g_topology_lock(); error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1); g_topology_unlock(); VOP_UNLOCK(devvp); if (error) return (error); /* XXX: should we check for some sectorsize or 512 instead? */ if (((SBSIZE % cp->provider->sectorsize) != 0) || (SBSIZE < cp->provider->sectorsize)) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); return (EINVAL); } bo = &devvp->v_bufobj; bo->bo_private = cp; bo->bo_ops = g_vfs_bufops; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2fs *)bp->b_data; if (ext2_check_sb_compat(es, dev, ronly) != 0) { error = EINVAL; /* XXX needs translation */ goto out; } if ((le16toh(es->e2fs_state) & E2FS_ISCLEAN) == 0 || (le16toh(es->e2fs_state) & E2FS_ERRORS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = malloc(sizeof(*ump), M_EXT2MNT, M_WAITOK | M_ZERO); /* * I don't know whether this is the right strategy. Note that * we dynamically allocate both an m_ext2fs and an ext2fs * while Linux keeps the super block in a locked buffer. 
*/ ump->um_e2fs = malloc(sizeof(struct m_ext2fs), M_EXT2MNT, M_WAITOK | M_ZERO); ump->um_e2fs->e2fs = malloc(sizeof(struct ext2fs), M_EXT2MNT, M_WAITOK); mtx_init(EXT2_MTX(ump), "EXT2FS", "EXT2FS Lock", MTX_DEF); bcopy(es, ump->um_e2fs->e2fs, (u_int)sizeof(struct ext2fs)); if ((error = ext2_compute_sb_data(devvp, ump->um_e2fs->e2fs, ump->um_e2fs))) goto out; /* * Calculate the maximum contiguous blocks and size of cluster summary * array. In FFS this is done by newfs; however, the superblock * in ext2fs doesn't have these variables, so we can calculate * them here. */ e2fs_maxcontig = MAX(1, MAXPHYS / ump->um_e2fs->e2fs_bsize); ump->um_e2fs->e2fs_contigsumsize = MIN(e2fs_maxcontig, EXT2_MAXCONTIG); if (ump->um_e2fs->e2fs_contigsumsize > 0) { size = ump->um_e2fs->e2fs_gcount * sizeof(int32_t); ump->um_e2fs->e2fs_maxcluster = malloc(size, M_EXT2MNT, M_WAITOK); size = ump->um_e2fs->e2fs_gcount * sizeof(struct csum); ump->um_e2fs->e2fs_clustersum = malloc(size, M_EXT2MNT, M_WAITOK); lp = ump->um_e2fs->e2fs_maxcluster; sump = ump->um_e2fs->e2fs_clustersum; for (i = 0; i < ump->um_e2fs->e2fs_gcount; i++, sump++) { *lp++ = ump->um_e2fs->e2fs_contigsumsize; sump->cs_init = 0; sump->cs_sum = malloc((ump->um_e2fs->e2fs_contigsumsize + 1) * sizeof(int32_t), M_EXT2MNT, M_WAITOK | M_ZERO); } } brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->e2fs_ronly = ronly; /* ronly is set according to mnt_flags */ /* * If the fs is not mounted read-only, make sure the super block is * always written back on a sync(). */ fs->e2fs_wasvalid = le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN ? 1 : 0; if (ronly == 0) { fs->e2fs_fmod = 1; /* mark it modified and set fs invalid */ fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN); } mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_bo = &devvp->v_bufobj; ump->um_cp = cp; /* * Setting those two parameters allowed us to use * ufs_bmap w/o changse! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = le32toh(fs->e2fs->e2fs_log_bsize) + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); /* * Initialize filesystem stat information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); return (0); out: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(EXT2_MTX(ump)); free(ump->um_e2fs->e2fs_gd, M_EXT2MNT); free(ump->um_e2fs->e2fs_contigdirs, M_EXT2MNT); free(ump->um_e2fs->e2fs, M_EXT2MNT); free(ump->um_e2fs, M_EXT2MNT); free(ump, M_EXT2MNT); mp->mnt_data = NULL; } return (error); } /* * Unmount system call. 
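 * (An editor's sketch is interposed below; ext2_unmount() follows it.)
 */

/*
 * Editor's sketch, illustrative only and never called: the metadata
 * overhead that ext2_statfs() later subtracts from the block count.
 * Every group carries a block bitmap, an inode bitmap and e2fs_itpg
 * inode table blocks; groups holding a superblock backup additionally
 * carry the superblock plus the group descriptor blocks (the reserved
 * GDT blocks of the resize feature are ignored here for brevity).
 */
static uint64_t
ext2_overhead_sketch(struct m_ext2fs *fs, int ngroups_with_sb)
{
	uint64_t overhead;

	overhead = le32toh(fs->e2fs->e2fs_first_dblock) +
	    fs->e2fs_gcount * (uint64_t)(2 + fs->e2fs_itpg);
	overhead += (uint64_t)ngroups_with_sb * (1 + fs->e2fs_gdbcount);
	return (overhead);
}

/*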
*/ static int ext2_unmount(struct mount *mp, int mntflags) { struct ext2mount *ump; struct m_ext2fs *fs; struct csum *sump; int error, flags, i, ronly; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, curthread)) != 0) return (error); ump = VFSTOEXT2(mp); fs = ump->um_e2fs; ronly = fs->e2fs_ronly; if (ronly == 0 && ext2_cgupdate(ump, MNT_WAIT) == 0) { if (fs->e2fs_wasvalid) fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN); ext2_sbupdate(ump, MNT_WAIT); } g_topology_lock(); g_vfs_close(ump->um_cp); g_topology_unlock(); vrele(ump->um_devvp); sump = fs->e2fs_clustersum; for (i = 0; i < fs->e2fs_gcount; i++, sump++) free(sump->cs_sum, M_EXT2MNT); free(fs->e2fs_clustersum, M_EXT2MNT); free(fs->e2fs_maxcluster, M_EXT2MNT); free(fs->e2fs_gd, M_EXT2MNT); free(fs->e2fs_contigdirs, M_EXT2MNT); free(fs->e2fs, M_EXT2MNT); free(fs, M_EXT2MNT); free(ump, M_EXT2MNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Flush out all the files in a filesystem. */ static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td) { int error; error = vflush(mp, 0, flags, td); return (error); } /* * Get filesystem statistics. */ int ext2_statfs(struct mount *mp, struct statfs *sbp) { struct ext2mount *ump; struct m_ext2fs *fs; uint32_t overhead, overhead_per_group, ngdb; int i, ngroups; ump = VFSTOEXT2(mp); fs = ump->um_e2fs; if (le16toh(fs->e2fs->e2fs_magic) != E2FS_MAGIC) panic("ext2_statfs"); /* * Compute the overhead (FS structures) */ overhead_per_group = 1 /* block bitmap */ + 1 /* inode bitmap */ + fs->e2fs_itpg; overhead = le32toh(fs->e2fs->e2fs_first_dblock) + fs->e2fs_gcount * overhead_per_group; if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 && le32toh(fs->e2fs->e2fs_features_rocompat) & EXT2F_ROCOMPAT_SPARSESUPER) { for (i = 0, ngroups = 0; i < fs->e2fs_gcount; i++) { if (ext2_cg_has_sb(fs, i)) ngroups++; } } else { ngroups = fs->e2fs_gcount; } ngdb = fs->e2fs_gdbcount; if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 && le32toh(fs->e2fs->e2fs_features_compat) & EXT2F_COMPAT_RESIZE) ngdb += le16toh(fs->e2fs->e2fs_reserved_ngdb); overhead += ngroups * (1 /* superblock */ + ngdb); sbp->f_bsize = EXT2_FRAG_SIZE(fs); sbp->f_iosize = EXT2_BLOCK_SIZE(fs); sbp->f_blocks = fs->e2fs_bcount - overhead; sbp->f_bfree = fs->e2fs_fbcount; sbp->f_bavail = sbp->f_bfree - fs->e2fs_rbcount; sbp->f_files = le32toh(fs->e2fs->e2fs_icount); sbp->f_ffree = fs->e2fs_ficount; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ext2_sync(struct mount *mp, int waitfor) { struct vnode *mvp, *vp; struct thread *td; struct inode *ip; struct ext2mount *ump = VFSTOEXT2(mp); struct m_ext2fs *fs; int error, allerror = 0; td = curthread; fs = ump->um_e2fs; if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */ panic("ext2_sync: rofs mod fs=%s", fs->e2fs_fsmnt); } /* * Write back each (modified) inode. 
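 *
 * Editor's note: the loop below follows the usual FreeBSD iterator
 * protocol.  vget() with LK_NOWAIT may fail on a busy vnode, which is
 * simply skipped; ENOENT means the vnode was recycled under us, in
 * which case MNT_VNODE_FOREACH_ALL_ABORT() drops the marker vnode and
 * the whole scan restarts from "loop".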
*/ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK); if (error) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } if ((error = VOP_FSYNC(vp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(vp); vrele(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor != MNT_LAZY) { vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp); } /* * Write back modified superblock. */ if (fs->e2fs_fmod != 0) { fs->e2fs_fmod = 0; fs->e2fs->e2fs_wtime = htole32(time_second); if ((error = ext2_cgupdate(ump, waitfor)) != 0) allerror = error; } return (allerror); } /* * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it * in from disk. If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ext2_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { struct m_ext2fs *fs; struct inode *ip; struct ext2mount *ump; struct buf *bp; struct vnode *vp; struct thread *td; unsigned int i, used_blocks; int error; td = curthread; error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); ump = VFSTOEXT2(mp); ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) { *vpp = NULL; free(ip, M_EXT2NODE); return (error); } vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_ump = ump; ip->i_number = ino; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); error = insmntque(vp, mp); if (error != 0) { free(ip, M_EXT2NODE); *vpp = NULL; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ino)), ip); if (error) { brelse(bp); vput(vp); *vpp = NULL; return (error); } ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; /* * Now we want to make sure that block pointers for unused * blocks are zeroed out - ext2_balloc depends on this * although for regular files and directories only * * If IN_E4EXTENTS is enabled, unused blocks are not zeroed * out because we could corrupt the extent tree. 
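 *
 * Editor's note (worked example): with 4 KiB blocks a 10000-byte
 * regular file spans howmany(10000, 4096) = 3 blocks, so i_db[3]
 * through i_db[EXT2_NDIR_BLOCKS - 1] are cleared below; the indirect
 * block pointers are not touched here.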
*/ if (!(ip->i_flag & IN_E4EXTENTS) && (S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode))) { used_blocks = howmany(ip->i_size, fs->e2fs_bsize); for (i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } #ifdef EXT2FS_PRINT_EXTENTS ext2_print_inode(ip); ext4_ext_print_extent_tree_status(ip); #endif bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct inode *ip; struct ufid *ufhp; struct vnode *nvp; struct m_ext2fs *fs; int error; ufhp = (struct ufid *)fhp; fs = VFSTOEXT2(mp)->um_e2fs; if (ufhp->ufid_ino < EXT2_ROOTINO || ufhp->ufid_ino > fs->e2fs_gcount * fs->e2fs_ipg) return (ESTALE); error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp); if (error) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, 0, curthread); return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(struct ext2mount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct ext2fs *es = fs->e2fs; struct buf *bp; int error = 0; es->e2fs_bcount = htole32(fs->e2fs_bcount & 0xffffffff); es->e2fs_rbcount = htole32(fs->e2fs_rbcount & 0xffffffff); es->e2fs_fbcount = htole32(fs->e2fs_fbcount & 0xffffffff); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { es->e4fs_bcount_hi = htole32(fs->e2fs_bcount >> 32); es->e4fs_rbcount_hi = htole32(fs->e2fs_rbcount >> 32); es->e4fs_fbcount_hi = htole32(fs->e2fs_fbcount >> 32); } es->e2fs_ficount = htole32(fs->e2fs_ficount); if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) ext2_sb_csum_set(fs); bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2fs)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. No need to write them here. 
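 *
 * Editor's note on the counter split above: under
 * EXT2F_INCOMPAT_64BIT each 64-bit counter is stored as two
 * little-endian 32-bit halves; e.g. a block count of 0x123456789 goes
 * out as e2fs_bcount = htole32(0x23456789) and e4fs_bcount_hi =
 * htole32(0x1).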
*/ return (error); } int ext2_cgupdate(struct ext2mount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct buf *bp; int i, j, g_count = 0, error = 0, allerror = 0; allerror = ext2_sbupdate(mp, waitfor); /* Update gd csums */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) || EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) ext2_gd_csum_set(fs); for (i = 0; i < fs->e2fs_gdbcount; i++) { bp = getblk(mp->um_devvp, fsbtodb(fs, ext2_cg_location(fs, i)), fs->e2fs_bsize, 0, 0, 0); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { memcpy(bp->b_data, &fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], fs->e2fs_bsize); } else { for (j = 0; j < fs->e2fs_bsize / E2FS_REV0_GD_SIZE && g_count < fs->e2fs_gcount; j++, g_count++) memcpy(bp->b_data + j * E2FS_REV0_GD_SIZE, &fs->e2fs_gd[g_count], E2FS_REV0_GD_SIZE); } if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); } if (!allerror && error) allerror = error; return (allerror); } /* * Return the root of a filesystem. */ static int ext2_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *nvp; int error; error = VFS_VGET(mp, EXT2_ROOTINO, LK_EXCLUSIVE, &nvp); if (error) return (error); *vpp = nvp; return (0); } Index: head/sys/fs/msdosfs/msdosfs_vfsops.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_vfsops.c (revision 364371) +++ head/sys/fs/msdosfs/msdosfs_vfsops.c (revision 364372) @@ -1,985 +1,985 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. 
* * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MSDOSFS_DEBUG #include #endif static const char msdosfs_lock_msg[] = "fatlk"; /* Mount options that we support. */ static const char *msdosfs_opts[] = { "async", "noatime", "noclusterr", "noclusterw", "export", "force", "from", "sync", "cs_dos", "cs_local", "cs_win", "dirmask", "gid", "kiconv", "longname", "longnames", "mask", "shortname", "shortnames", "uid", "win95", "nowin95", NULL }; #if 1 /*def PC98*/ /* * XXX - The boot signature formatted by NEC PC-98 DOS looks like a * garbage or a random value :-{ * If you want to use that broken-signatured media, define the * following symbol even though PC/AT. * (ex. mount PC-98 DOS formatted FD on PC/AT) */ #define MSDOSFS_NOCHECKSIG #endif MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure"); static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table"); struct iconv_functions *msdosfs_iconv; static int update_mp(struct mount *mp, struct thread *td); static int mountmsdosfs(struct vnode *devvp, struct mount *mp); static vfs_fhtovp_t msdosfs_fhtovp; static vfs_mount_t msdosfs_mount; static vfs_root_t msdosfs_root; static vfs_statfs_t msdosfs_statfs; static vfs_sync_t msdosfs_sync; static vfs_unmount_t msdosfs_unmount; /* Maximum length of a character set name (arbitrary). 
*/ #define MAXCSLEN 64 static int update_mp(struct mount *mp, struct thread *td) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); void *dos, *win, *local; int error, v; if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) { if (msdosfs_iconv != NULL) { error = vfs_getopt(mp->mnt_optnew, "cs_win", &win, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_local", &local, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_dos", &dos, NULL); if (!error) { msdosfs_iconv->open(win, local, &pmp->pm_u2w); msdosfs_iconv->open(local, win, &pmp->pm_w2u); msdosfs_iconv->open(dos, local, &pmp->pm_u2d); msdosfs_iconv->open(local, dos, &pmp->pm_d2u); } if (error != 0) return (error); } else { pmp->pm_w2u = NULL; pmp->pm_u2w = NULL; pmp->pm_d2u = NULL; pmp->pm_u2d = NULL; } } if (vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v) == 1) pmp->pm_gid = v; if (vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v) == 1) pmp->pm_uid = v; if (vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v) == 1) pmp->pm_mask = v & ALLPERMS; if (vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v) == 1) pmp->pm_dirmask = v & ALLPERMS; vfs_flagopt(mp->mnt_optnew, "shortname", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "shortnames", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "longname", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "longnames", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "kiconv", &pmp->pm_flags, MSDOSFSMNT_KICONV); if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; else pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else pmp->pm_flags |= MSDOSFSMNT_LONGNAME; return 0; } static int msdosfs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct msdosfs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); ma = mount_argf(ma, "uid", "%d", args.uid); ma = mount_argf(ma, "gid", "%d", args.gid); ma = mount_argf(ma, "mask", "%d", args.mask); ma = mount_argf(ma, "dirmask", "%d", args.dirmask); ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname"); ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv"); ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN); ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN); ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN); error = kernel_mount(ma, flags); return (error); } /* * mp - path - addr in user space of mount point (ie /usr or whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ static int msdosfs_mount(struct mount *mp) { struct vnode *devvp; /* vnode for blk device to mount */ struct thread *td; /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; struct nameidata ndp; int error, flags; accmode_t accmode; char *from; td = curthread; if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts)) return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. 
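 *
 * Editor's note: both transitions below are ordered around the dirty
 * flag.  Downgrading marks the volume clean with markvoldirty(pmp, 0)
 * while the device is still writable and only then drops the GEOM
 * write count; upgrading acquires the write count first and only then
 * marks the volume dirty, backing the reference out again on failure.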
*/ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, 0, flags, td); if (error) return (error); /* * Now the volume is clean. Mark it so while the * device is still rw. */ error = markvoldirty(pmp, 0); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* Downgrade the device from rw to ro. */ g_topology_lock(); error = g_access(pmp->pm_cp, 0, -1, 0); g_topology_unlock(); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* * Backing out after an error was painful in the * above. Now we are committed to succeeding. */ pmp->pm_fmod = 0; pmp->pm_flags |= MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ devvp = pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp); return (error); } VOP_UNLOCK(devvp); g_topology_lock(); error = g_access(pmp->pm_cp, 0, 1, 0); g_topology_unlock(); if (error) return (error); /* Now that the volume is modifiable, mark it dirty. */ error = markvoldirty_upgrade(pmp, true, true); if (error) { /* * If dirtying the superblock failed, drop GEOM * 'w' refs (we're still RO). */ g_topology_lock(); (void)g_access(pmp->pm_cp, 0, -1, 0); g_topology_unlock(); return (error); } pmp->pm_fmod = 1; pmp->pm_flags &= ~MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); error = namei(&ndp); if (error) return (error); devvp = ndp.ni_vp; NDFREE(&ndp, NDF_ONLY_PNBUF); - if (!vn_isdisk(devvp, &error)) { + if (!vn_isdisk_error(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. 
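 *
 * Editor's note: as in the update path above, a failed VOP_ACCESS()
 * is retried as priv_check(td, PRIV_VFS_MOUNT_PERM), so a
 * sufficiently privileged thread may mount a device whose permission
 * bits would otherwise deny it.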
*/ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = mountmsdosfs(devvp, mp); #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { vput(devvp); if (devvp != pmp->pm_devvp) return (EINVAL); /* XXX needs translation */ } if (error) { vrele(devvp); return (error); } error = update_mp(mp, td); if (error) { if ((mp->mnt_flag & MNT_UPDATE) == 0) msdosfs_unmount(mp, MNT_FORCE); return error; } vfs_mountedfrom(mp, from); #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return (0); } static int mountmsdosfs(struct vnode *devvp, struct mount *mp) { struct msdosfsmount *pmp; struct buf *bp; struct cdev *dev; union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; uint8_t SecPerClust; u_long clusters; int ronly, error; struct g_consumer *cp; struct bufobj *bo; bp = NULL; /* This and pmp both used in error_exit. */ pmp = NULL; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { VOP_UNLOCK(devvp); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); VOP_UNLOCK(devvp); return (error); } dev_ref(dev); bo = &devvp->v_bufobj; VOP_UNLOCK(devvp); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. * * NOTE: 8192 is a magic size that works for ffs. */ error = bread(devvp, 0, 8192, NOCRED, &bp); if (error) goto error_exit; bp->b_flags |= B_AGE; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB; #ifndef MSDOSFS_NOCHECKSIG if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { error = EINVAL; goto error_exit; } #endif pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO); pmp->pm_mountp = mp; pmp->pm_cp = cp; pmp->pm_bo = bo; lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0); /* * Initialize ownerships and permissions, since nothing else will * initialize them iff we are mounting root. */ pmp->pm_uid = UID_ROOT; pmp->pm_gid = GID_WHEEL; pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH | S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR; /* * Compute several useful quantities from the bpb in the * bootsector. Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. 
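 *
 * Editor's note (worked example): pm_BlkPerSec below rescales media
 * sectors to DEV_BSIZE (512-byte) units; a device with 4096-byte
 * sectors gets pm_BlkPerSec = 8, and the sector counts taken from the
 * BPB are multiplied by it further down.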
*/ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); if (pmp->pm_BytesPerSec < DEV_BSIZE) { error = EINVAL; goto error_exit; } pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; /* calculate the ratio of sector size to DEV_BSIZE */ pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE; /* * We don't check pm_Heads nor pm_SecPerTrack, because * these may not be set for EFI file systems. We don't * use these anyway, so we're unaffected if they are * invalid. */ if (!pmp->pm_BytesPerSec || !SecPerClust) { error = EINVAL; goto error_exit; } if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } if (pmp->pm_RootDirEnts == 0) { if (pmp->pm_FATsecs || getushort(b710->bpbFSVers)) { error = EINVAL; #ifdef MSDOSFS_DEBUG printf("mountmsdosfs(): bad FAT32 filesystem\n"); #endif goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); if (getushort(b710->bpbExtFlags) & FATMIRROR) pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; else pmp->pm_flags |= MSDOSFS_FATMIRROR; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; /* * Check a few values (could do some more): * - logical sector size: power of 2, >= block size * - sectors per cluster: power of 2, >= 1 * - number of sectors: >= 1, <= size of partition * - number of FAT sectors: >= 1 */ if ( (SecPerClust == 0) || (SecPerClust & (SecPerClust - 1)) || (pmp->pm_BytesPerSec < DEV_BSIZE) || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1)) || (pmp->pm_HugeSectors == 0) || (pmp->pm_FATsecs == 0) || (SecPerClust * pmp->pm_BlkPerSec > MAXBSIZE / DEV_BSIZE) ) { error = EINVAL; goto error_exit; } pmp->pm_HugeSectors *= pmp->pm_BlkPerSec; pmp->pm_HiddenSects *= pmp->pm_BlkPerSec; /* XXX not used? */ pmp->pm_FATsecs *= pmp->pm_BlkPerSec; SecPerClust *= pmp->pm_BlkPerSec; pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec; } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = howmany(pmp->pm_RootDirEnts * sizeof(struct direntry), DEV_BSIZE); /* in blocks */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust + 1; pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE; /* XXX not used? */ if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one FAT entry will not be split across * multiple blocks. 
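 *
 * Editor's note: FAT12 entries are 12 bits (1.5 bytes), hence the
 * pm_fatmult/pm_fatdiv of 3/2 below -- the byte offset of the entry
 * for cluster cn is cn * 3 / 2 -- and hence the 3 * 512 FAT12 block
 * size chosen later, which holds a whole number (1024) of entries.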
*/ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv; if (pmp->pm_maxcluster >= clusters) { #ifdef MSDOSFS_DEBUG printf("Warning: number of clusters (%ld) exceeds FAT " "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters); #endif pmp->pm_maxcluster = clusters - 1; } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * 512; else pmp->pm_fatblocksize = PAGE_SIZE; pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize, pmp->pm_BytesPerSec); pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE; pmp->pm_bnshift = ffs(DEV_BSIZE) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * DEV_BSIZE; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check the fsinfo sector if we have one. Silently fix up our * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff) * or too large. Ignore fp->fsinfree for now, since we need to * read the entire FAT anyway to fill the inuse map. */ if (pmp->pm_fsinfo) { struct fsinfo *fp; if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!bcmp(fp->fsisig1, "RRaA", 4) && !bcmp(fp->fsisig2, "rrAa", 4) && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) { pmp->pm_nxtfree = getulong(fp->fsinxtfree); if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; } else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Finish initializing pmp->pm_nxtfree (just in case the first few * sectors aren't properly reserved in the FAT). This completes * the fixup for fp->fsinxtfree, and fixes up the zero-initialized * value if there is no fsinfo. We will use pmp->pm_nxtfree * internally even if there is no fsinfo. */ if (pmp->pm_nxtfree < CLUST_FIRST) pmp->pm_nxtfree = CLUST_FIRST; /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_devvp = devvp; pmp->pm_dev = dev; /* * Have the inuse map filled in. */ MSDOSFS_LOCK_MP(pmp); error = fillinusemap(pmp); MSDOSFS_UNLOCK_MP(pmp); if (error != 0) goto error_exit; /* * If they want FAT updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the FAT being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. */ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. 
*/ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else { if ((error = markvoldirty(pmp, 1)) != 0) goto error_exit; pmp->pm_fmod = 1; } mp->mnt_data = pmp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_USES_BCACHE | MNTK_NO_IOPF; MNT_IUNLOCK(mp); return (0); error_exit: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (pmp) { lockdestroy(&pmp->pm_fatlock); free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; } atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); dev_rel(dev); return (error); } /* * Unmount the filesystem described by mp. */ static int msdosfs_unmount(struct mount *mp, int mntflags) { struct msdosfsmount *pmp; int error, flags; error = flags = 0; pmp = VFSTOMSDOSFS(mp); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) error = msdosfs_sync(mp, MNT_WAIT); if ((mntflags & MNT_FORCE) != 0) flags |= FORCECLOSE; else if (error != 0) return (error); error = vflush(mp, 0, flags, curthread); if (error != 0 && error != ENXIO) return (error); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) { error = markvoldirty(pmp, 0); if (error && error != ENXIO) { (void)markvoldirty(pmp, 1); return (error); } } if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { if (pmp->pm_w2u) msdosfs_iconv->close(pmp->pm_w2u); if (pmp->pm_u2w) msdosfs_iconv->close(pmp->pm_u2w); if (pmp->pm_d2u) msdosfs_iconv->close(pmp->pm_d2u); if (pmp->pm_u2d) msdosfs_iconv->close(pmp->pm_u2d); } #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); VI_LOCK(vp); vn_printf(vp, "msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("freef %p, freeb %p, mount %p\n", TAILQ_NEXT(vp, v_vnodelist), vp->v_vnodelist.tqe_prev, vp->v_mount); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n", TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd), TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd), vp->v_bufobj.bo_numoutput, vp->v_type); VI_UNLOCK(vp); BO_UNLOCK(bo); } #endif g_topology_lock(); g_vfs_close(pmp->pm_cp); g_topology_unlock(); atomic_store_rel_ptr((uintptr_t *)&pmp->pm_dev->si_mountpt, 0); vrele(pmp->pm_devvp); dev_rel(pmp->pm_dev); free(pmp->pm_inusemap, M_MSDOSFSFAT); lockdestroy(&pmp->pm_fatlock); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } static int msdosfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct denode *ndep; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep); if (error) return (error); *vpp = DETOV(ndep); return (0); } static int msdosfs_statfs(struct mount *mp, struct statfs *sbp) { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_maxcluster + 1; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? */ return (0); } /* * If we have an FSInfo block, update it. 
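 * (An editor's sketch is interposed below; msdosfs_fsiflush() follows.)
 */

/*
 * Editor's sketch, illustrative only and never called: how the
 * mask/shift values computed in mountmsdosfs() are meant to be used.
 * With a 4 KiB cluster, pm_cnshift is 12 and pm_crbomask is 0xfff, so
 * a file offset splits into a relative cluster number and a byte
 * offset within that cluster with one shift and one mask.
 */
static void
msdosfs_split_offset_sketch(struct msdosfsmount *pmp, off_t off,
    u_long *cnp, u_long *crbop)
{

	*cnp = off >> pmp->pm_cnshift;		/* relative cluster */
	*crbop = off & pmp->pm_crbomask;	/* byte offset in cluster */
}

/*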
*/ static int msdosfs_fsiflush(struct msdosfsmount *pmp, int waitfor) { struct fsinfo *fp; struct buf *bp; int error; MSDOSFS_LOCK_MP(pmp); if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) { error = 0; goto unlock; } error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp); if (error != 0) { goto unlock; } fp = (struct fsinfo *)bp->b_data; putulong(fp->fsinfree, pmp->pm_freeclustercount); putulong(fp->fsinxtfree, pmp->pm_nxtfree); pmp->pm_flags &= ~MSDOSFS_FSIMOD; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); unlock: MSDOSFS_UNLOCK_MP(pmp); return (error); } static int msdosfs_sync(struct mount *mp, int waitfor) { struct vnode *vp, *nvp; struct thread *td; struct denode *dep; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; td = curthread; /* * If we ever switch to not updating all of the FATs all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update FATs here */ } } /* * Write back each (modified) denode. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, nvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } dep = VTODE(vp); if ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK); if (error) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, nvp); goto loop; } continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp); vrele(vp); } /* * Flush filesystem control info. */ if (waitfor != MNT_LAZY) { vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(pmp->pm_devvp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(pmp->pm_devvp); } error = msdosfs_fsiflush(pmp, waitfor); if (error != 0) allerror = error; return (allerror); } static int msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid *defhp = (struct defid *) fhp; struct denode *dep; int error; error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep); if (error) { *vpp = NULLVP; return (error); } *vpp = DETOV(dep); vnode_create_vobject(*vpp, dep->de_FileSize, curthread); return (0); } static struct vfsops msdosfs_vfsops = { .vfs_fhtovp = msdosfs_fhtovp, .vfs_mount = msdosfs_mount, .vfs_cmount = msdosfs_cmount, .vfs_root = msdosfs_root, .vfs_statfs = msdosfs_statfs, .vfs_sync = msdosfs_sync, .vfs_unmount = msdosfs_unmount, }; VFS_SET(msdosfs_vfsops, msdosfs, 0); MODULE_VERSION(msdosfs, 1); Index: head/sys/fs/udf/udf_vfsops.c =================================================================== --- head/sys/fs/udf/udf_vfsops.c (revision 364371) +++ head/sys/fs/udf/udf_vfsops.c (revision 364372) @@ -1,825 +1,825 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001, 2002 Scott Long * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* udf_vfsops.c */ /* Implement the VFS side of things */ /* * Ok, here's how it goes. The UDF specs are pretty clear on how each data * structure is made up, but not very clear on how they relate to each other. * Here is the skinny... This demostrates a filesystem with one file in the * root directory. Subdirectories are treated just as normal files, but they * have File Id Descriptors of their children as their file data. As for the * Anchor Volume Descriptor Pointer, it can exist in two of the following three * places: sector 256, sector n (the max sector of the disk), or sector * n - 256. It's a pretty good bet that one will exist at sector 256 though. * One caveat is unclosed CD media. For that, sector 256 cannot be written, * so the Anchor Volume Descriptor Pointer can exist at sector 512 until the * media is closed. * * Sector: * 256: * n: Anchor Volume Descriptor Pointer * n - 256: | * | * |-->Main Volume Descriptor Sequence * | | * | | * | |-->Logical Volume Descriptor * | | * |-->Partition Descriptor | * | | * | | * |-->Fileset Descriptor * | * | * |-->Root Dir File Entry * | * | * |-->File data: * File Id Descriptor * | * | * |-->File Entry * | * | * |-->File data */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_UDFMOUNT, "udf_mount", "UDF mount structure"); MALLOC_DEFINE(M_UDFFENTRY, "udf_fentry", "UDF file entry structure"); struct iconv_functions *udf_iconv = NULL; /* Zones */ uma_zone_t udf_zone_trans = NULL; uma_zone_t udf_zone_node = NULL; uma_zone_t udf_zone_ds = NULL; static vfs_init_t udf_init; static vfs_uninit_t udf_uninit; static vfs_mount_t udf_mount; static vfs_root_t udf_root; static vfs_statfs_t udf_statfs; static vfs_unmount_t udf_unmount; static vfs_fhtovp_t udf_fhtovp; static int udf_find_partmaps(struct udf_mnt *, struct logvol_desc *); static struct vfsops udf_vfsops = { .vfs_fhtovp = udf_fhtovp, .vfs_init = udf_init, .vfs_mount = udf_mount, .vfs_root = udf_root, .vfs_statfs = udf_statfs, .vfs_uninit = udf_uninit, .vfs_unmount = udf_unmount, .vfs_vget = udf_vget, }; VFS_SET(udf_vfsops, udf, VFCF_READONLY); MODULE_VERSION(udf, 1); static int udf_mountfs(struct vnode *, struct mount *); static int udf_init(struct vfsconf *foo) { /* * This code used to pre-allocate a certain number of pages for each * pool, reducing the need to grow the zones later on. 
UMA doesn't * advertise any such functionality, unfortunately =-< */ udf_zone_trans = uma_zcreate("UDF translation buffer, zone", MAXNAMLEN * sizeof(unicode_t), NULL, NULL, NULL, NULL, 0, 0); udf_zone_node = uma_zcreate("UDF Node zone", sizeof(struct udf_node), NULL, NULL, NULL, NULL, 0, 0); udf_zone_ds = uma_zcreate("UDF Dirstream zone", sizeof(struct udf_dirstream), NULL, NULL, NULL, NULL, 0, 0); if ((udf_zone_node == NULL) || (udf_zone_trans == NULL) || (udf_zone_ds == NULL)) { printf("Cannot create allocation zones.\n"); return (ENOMEM); } return 0; } static int udf_uninit(struct vfsconf *foo) { if (udf_zone_trans != NULL) { uma_zdestroy(udf_zone_trans); udf_zone_trans = NULL; } if (udf_zone_node != NULL) { uma_zdestroy(udf_zone_node); udf_zone_node = NULL; } if (udf_zone_ds != NULL) { uma_zdestroy(udf_zone_ds); udf_zone_ds = NULL; } return (0); } static int udf_mount(struct mount *mp) { struct vnode *devvp; /* vnode of the mount device */ struct thread *td; struct udf_mnt *imp = NULL; struct vfsoptlist *opts; char *fspec, *cs_disk, *cs_local; int error, len, *udf_flags; struct nameidata nd, *ndp = &nd; td = curthread; opts = mp->mnt_optnew; /* * Unconditionally mount as read-only. */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * No root filesystem support. Probably not a big deal, since the * bootloader doesn't understand UDF. */ if (mp->mnt_flag & MNT_ROOTFS) return (ENOTSUP); fspec = NULL; error = vfs_getopt(opts, "from", (void **)&fspec, &len); if (!error && fspec[len - 1] != '\0') return (EINVAL); if (mp->mnt_flag & MNT_UPDATE) { return (0); } /* Check that the mount device exists */ if (fspec == NULL) return (EINVAL); NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(ndp))) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; - if (vn_isdisk(devvp, &error) == 0) { + if (!vn_isdisk_error(devvp, &error)) { vput(devvp); return (error); } /* Check the access rights on the mount device */ error = VOP_ACCESS(devvp, VREAD, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((error = udf_mountfs(devvp, mp))) { vrele(devvp); return (error); } imp = VFSTOUDFFS(mp); udf_flags = NULL; error = vfs_getopt(opts, "flags", (void **)&udf_flags, &len); if (error || len != sizeof(int)) return (EINVAL); imp->im_flags = *udf_flags; if (imp->im_flags & UDFMNT_KICONV && udf_iconv) { cs_disk = NULL; error = vfs_getopt(opts, "cs_disk", (void **)&cs_disk, &len); if (!error && cs_disk[len - 1] != '\0') return (EINVAL); cs_local = NULL; error = vfs_getopt(opts, "cs_local", (void **)&cs_local, &len); if (!error && cs_local[len - 1] != '\0') return (EINVAL); udf_iconv->open(cs_local, cs_disk, &imp->im_d2l); #if 0 udf_iconv->open(cs_disk, cs_local, &imp->im_l2d); #endif } vfs_mountedfrom(mp, fspec); return 0; }; /* * Check the descriptor tag for both the correct id and correct checksum. * Return zero if all is good, EINVAL if not. 
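 *
 * (An editor's sketch is interposed below; udf_checktag() follows it.)
 */

/*
 * Editor's sketch, illustrative only: the checksum is the 8-bit sum of
 * the 16 tag bytes with the checksum byte itself (offset 4) skipped.
 * udf_checktag() computes it by summing all 16 bytes and subtracting
 * byte 4, which is equivalent modulo 256; the inverse operation, for
 * building a tag, would be:
 */
static uint8_t
udf_tag_cksum_sketch(const uint8_t *itag)
{
	uint8_t cksum;
	int i;

	cksum = 0;
	for (i = 0; i < 16; i++)
		if (i != 4)
			cksum += itag[i];
	return (cksum);
}

/*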
*/ int udf_checktag(struct desc_tag *tag, uint16_t id) { uint8_t *itag; uint8_t i, cksum = 0; itag = (uint8_t *)tag; if (le16toh(tag->id) != id) return (EINVAL); for (i = 0; i < 16; i++) cksum = cksum + itag[i]; cksum = cksum - itag[4]; if (cksum == tag->cksum) return (0); return (EINVAL); } static int udf_mountfs(struct vnode *devvp, struct mount *mp) { struct buf *bp = NULL; struct cdev *dev; struct anchor_vdp avdp; struct udf_mnt *udfmp = NULL; struct part_desc *pd; struct logvol_desc *lvd; struct fileset_desc *fsd; struct file_entry *root_fentry; uint32_t sector, size, mvds_start, mvds_end; uint32_t logical_secsize; uint32_t fsd_offset = 0; uint16_t part_num = 0, fsd_part = 0; int error = EINVAL; int logvol_found = 0, part_found = 0, fsd_found = 0; int bsize; struct g_consumer *cp; struct bufobj *bo; dev = devvp->v_rdev; dev_ref(dev); g_topology_lock(); error = g_vfs_open(devvp, &cp, "udf", 0); g_topology_unlock(); VOP_UNLOCK(devvp); if (error) goto bail; bo = &devvp->v_bufobj; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; /* XXX: should be M_WAITOK */ udfmp = malloc(sizeof(struct udf_mnt), M_UDFMOUNT, M_NOWAIT | M_ZERO); if (udfmp == NULL) { printf("Cannot allocate UDF mount struct\n"); error = ENOMEM; goto bail; } mp->mnt_data = udfmp; mp->mnt_stat.f_fsid.val[0] = dev2udev(devvp->v_rdev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED; MNT_IUNLOCK(mp); udfmp->im_mountp = mp; udfmp->im_dev = dev; udfmp->im_devvp = devvp; udfmp->im_d2l = NULL; udfmp->im_cp = cp; udfmp->im_bo = bo; #if 0 udfmp->im_l2d = NULL; #endif /* * The UDF specification defines a logical sectorsize of 2048 * for DVD media. */ logical_secsize = 2048; if (((logical_secsize % cp->provider->sectorsize) != 0) || (logical_secsize < cp->provider->sectorsize)) { error = EINVAL; goto bail; } bsize = cp->provider->sectorsize; /* * Get the Anchor Volume Descriptor Pointer from sector 256. * XXX Should also check sector n - 256, n, and 512. */ sector = 256; if ((error = bread(devvp, sector * btodb(logical_secsize), bsize, NOCRED, &bp)) != 0) goto bail; if ((error = udf_checktag((struct desc_tag *)bp->b_data, TAGID_ANCHOR))) goto bail; bcopy(bp->b_data, &avdp, sizeof(struct anchor_vdp)); brelse(bp); bp = NULL; /* * Extract the Partition Descriptor and Logical Volume Descriptor * from the Volume Descriptor Sequence. * XXX Should we care about the partition type right now? * XXX What about multiple partitions? 
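 *
 * Editor's note: the loop below walks the Main Volume Descriptor
 * Sequence one sector at a time, latching the first Logical Volume
 * Descriptor and Partition Descriptor it finds and stopping once both
 * are present; the fileset location is then resolved against the
 * partition start.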
*/ mvds_start = le32toh(avdp.main_vds_ex.loc); mvds_end = mvds_start + (le32toh(avdp.main_vds_ex.len) - 1) / bsize; for (sector = mvds_start; sector < mvds_end; sector++) { if ((error = bread(devvp, sector * btodb(logical_secsize), bsize, NOCRED, &bp)) != 0) { printf("Can't read sector %d of VDS\n", sector); goto bail; } lvd = (struct logvol_desc *)bp->b_data; if (!udf_checktag(&lvd->tag, TAGID_LOGVOL)) { udfmp->bsize = le32toh(lvd->lb_size); udfmp->bmask = udfmp->bsize - 1; udfmp->bshift = ffs(udfmp->bsize) - 1; fsd_part = le16toh(lvd->_lvd_use.fsd_loc.loc.part_num); fsd_offset = le32toh(lvd->_lvd_use.fsd_loc.loc.lb_num); if (udf_find_partmaps(udfmp, lvd)) break; logvol_found = 1; } pd = (struct part_desc *)bp->b_data; if (!udf_checktag(&pd->tag, TAGID_PARTITION)) { part_found = 1; part_num = le16toh(pd->part_num); udfmp->part_len = le32toh(pd->part_len); udfmp->part_start = le32toh(pd->start_loc); } brelse(bp); bp = NULL; if ((part_found) && (logvol_found)) break; } if (!part_found || !logvol_found) { error = EINVAL; goto bail; } if (fsd_part != part_num) { printf("FSD does not lie within the partition!\n"); error = EINVAL; goto bail; } /* * Grab the Fileset Descriptor * Thanks to Chuck McCrobie for pointing * me in the right direction here. */ sector = udfmp->part_start + fsd_offset; if ((error = RDSECTOR(devvp, sector, udfmp->bsize, &bp)) != 0) { printf("Cannot read sector %d of FSD\n", sector); goto bail; } fsd = (struct fileset_desc *)bp->b_data; if (!udf_checktag(&fsd->tag, TAGID_FSD)) { fsd_found = 1; bcopy(&fsd->rootdir_icb, &udfmp->root_icb, sizeof(struct long_ad)); } brelse(bp); bp = NULL; if (!fsd_found) { printf("Couldn't find the fsd\n"); error = EINVAL; goto bail; } /* * Find the file entry for the root directory. */ sector = le32toh(udfmp->root_icb.loc.lb_num) + udfmp->part_start; size = le32toh(udfmp->root_icb.len); if ((error = udf_readdevblks(udfmp, sector, size, &bp)) != 0) { printf("Cannot read sector %d\n", sector); goto bail; } root_fentry = (struct file_entry *)bp->b_data; if ((error = udf_checktag(&root_fentry->tag, TAGID_FENTRY))) { printf("Invalid root file entry!\n"); goto bail; } brelse(bp); bp = NULL; return 0; bail: if (udfmp != NULL) free(udfmp, M_UDFMOUNT); if (bp != NULL) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } dev_rel(dev); return error; }; static int udf_unmount(struct mount *mp, int mntflags) { struct udf_mnt *udfmp; int error, flags = 0; udfmp = VFSTOUDFFS(mp); if (mntflags & MNT_FORCE) flags |= FORCECLOSE; if ((error = vflush(mp, 0, flags, curthread))) return (error); if (udfmp->im_flags & UDFMNT_KICONV && udf_iconv) { if (udfmp->im_d2l) udf_iconv->close(udfmp->im_d2l); #if 0 if (udfmp->im_l2d) udf_iconv->close(udfmp->im_l2d); #endif } g_topology_lock(); g_vfs_close(udfmp->im_cp); g_topology_unlock(); vrele(udfmp->im_devvp); dev_rel(udfmp->im_dev); if (udfmp->s_table != NULL) free(udfmp->s_table, M_UDFMOUNT); free(udfmp, M_UDFMOUNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (0); } static int udf_root(struct mount *mp, int flags, struct vnode **vpp) { struct udf_mnt *udfmp; ino_t id; udfmp = VFSTOUDFFS(mp); id = udf_getid(&udfmp->root_icb); return (udf_vget(mp, id, flags, vpp)); } static int udf_statfs(struct mount *mp, struct statfs *sbp) { struct udf_mnt *udfmp; udfmp = VFSTOUDFFS(mp); sbp->f_bsize = udfmp->bsize; sbp->f_iosize = udfmp->bsize; sbp->f_blocks = udfmp->part_len; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 0; sbp->f_ffree = 0; return 
0; } int udf_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { struct buf *bp; struct vnode *devvp; struct udf_mnt *udfmp; struct thread *td; struct vnode *vp; struct udf_node *unode; struct file_entry *fe; int error, sector, size; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ td = curthread; udfmp = VFSTOUDFFS(mp); unode = uma_zalloc(udf_zone_node, M_WAITOK | M_ZERO); if ((error = udf_allocv(mp, &vp, td))) { printf("Error from udf_allocv\n"); uma_zfree(udf_zone_node, unode); return (error); } unode->i_vnode = vp; unode->hash_id = ino; unode->udfmp = udfmp; vp->v_data = unode; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); error = insmntque(vp, mp); if (error != 0) { uma_zfree(udf_zone_node, unode); return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * Copy in the file entry. Per the spec, the size can only be 1 block. */ sector = ino + udfmp->part_start; devvp = udfmp->im_devvp; if ((error = RDSECTOR(devvp, sector, udfmp->bsize, &bp)) != 0) { printf("Cannot read sector %d\n", sector); vgone(vp); vput(vp); brelse(bp); *vpp = NULL; return (error); } fe = (struct file_entry *)bp->b_data; if (udf_checktag(&fe->tag, TAGID_FENTRY)) { printf("Invalid file entry!\n"); vgone(vp); vput(vp); brelse(bp); *vpp = NULL; return (ENOMEM); } size = UDF_FENTRY_SIZE + le32toh(fe->l_ea) + le32toh(fe->l_ad); unode->fentry = malloc(size, M_UDFFENTRY, M_NOWAIT | M_ZERO); if (unode->fentry == NULL) { printf("Cannot allocate file entry block\n"); vgone(vp); vput(vp); brelse(bp); *vpp = NULL; return (ENOMEM); } bcopy(bp->b_data, unode->fentry, size); brelse(bp); bp = NULL; switch (unode->fentry->icbtag.file_type) { default: vp->v_type = VBAD; break; case 4: vp->v_type = VDIR; break; case 5: vp->v_type = VREG; break; case 6: vp->v_type = VBLK; break; case 7: vp->v_type = VCHR; break; case 9: vp->v_type = VFIFO; vp->v_op = &udf_fifoops; break; case 10: vp->v_type = VSOCK; break; case 12: vp->v_type = VLNK; break; } if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); if (ino == udf_getid(&udfmp->root_icb)) vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } static int udf_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct ifid *ifhp; struct vnode *nvp; struct udf_node *np; off_t fsize; int error; ifhp = (struct ifid *)fhp; if ((error = VFS_VGET(mp, ifhp->ifid_ino, LK_EXCLUSIVE, &nvp)) != 0) { *vpp = NULLVP; return (error); } np = VTON(nvp); fsize = le64toh(np->fentry->inf_len); *vpp = nvp; vnode_create_vobject(*vpp, fsize, curthread); return (0); } static int udf_find_partmaps(struct udf_mnt *udfmp, struct logvol_desc *lvd) { struct part_map_spare *pms; struct regid *pmap_id; struct buf *bp; unsigned char regid_id[UDF_REGID_ID_SIZE + 1]; int i, k, ptype, psize, error; uint8_t *pmap = (uint8_t *) &lvd->maps[0]; for (i = 0; i < le32toh(lvd->n_pm); i++) { ptype = pmap[0]; psize = pmap[1]; if (((ptype != 1) && (ptype != 2)) || ((psize != UDF_PMAP_TYPE1_SIZE) && (psize != UDF_PMAP_TYPE2_SIZE))) { 
printf("Invalid partition map found\n"); return (1); } if (ptype == 1) { /* Type 1 map. We don't care */ pmap += UDF_PMAP_TYPE1_SIZE; continue; } /* Type 2 map. Gotta find out the details */ pmap_id = (struct regid *)&pmap[4]; bzero(®id_id[0], UDF_REGID_ID_SIZE); bcopy(&pmap_id->id[0], ®id_id[0], UDF_REGID_ID_SIZE); if (bcmp(®id_id[0], "*UDF Sparable Partition", UDF_REGID_ID_SIZE)) { printf("Unsupported partition map: %s\n", ®id_id[0]); return (1); } pms = (struct part_map_spare *)pmap; pmap += UDF_PMAP_TYPE2_SIZE; udfmp->s_table = malloc(le32toh(pms->st_size), M_UDFMOUNT, M_NOWAIT | M_ZERO); if (udfmp->s_table == NULL) return (ENOMEM); /* Calculate the number of sectors per packet. */ /* XXX Logical or physical? */ udfmp->p_sectors = le16toh(pms->packet_len) / udfmp->bsize; /* * XXX If reading the first Sparing Table fails, should look * for another table. */ if ((error = udf_readdevblks(udfmp, le32toh(pms->st_loc[0]), le32toh(pms->st_size), &bp)) != 0) { if (bp != NULL) brelse(bp); printf("Failed to read Sparing Table at sector %d\n", le32toh(pms->st_loc[0])); free(udfmp->s_table, M_UDFMOUNT); return (error); } bcopy(bp->b_data, udfmp->s_table, le32toh(pms->st_size)); brelse(bp); if (udf_checktag(&udfmp->s_table->tag, 0)) { printf("Invalid sparing table found\n"); free(udfmp->s_table, M_UDFMOUNT); return (EINVAL); } /* See how many valid entries there are here. The list is * supposed to be sorted. 0xfffffff0 and higher are not valid */ for (k = 0; k < le16toh(udfmp->s_table->rt_l); k++) { udfmp->s_table_entries = k; if (le32toh(udfmp->s_table->entries[k].org) >= 0xfffffff0) break; } } return (0); } Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 364371) +++ head/sys/kern/vfs_aio.c (revision 364372) @@ -1,2987 +1,2987 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. 
*/ static uint64_t jobseqno; #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Async IO management"); static int enable_aio_unsafe = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, "Permit asynchronous IO on all file types, not just known-safe types"); static unsigned int unsafe_warningcnt = 1; SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, &unsafe_warningcnt, 0, "Warnings that will be triggered upon failed IO requests on unsafe files"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel processes to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel processes for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. */ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel processes for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); static int num_unmapped_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio, 0, "Number of aio requests presently handled by unmapped I/O buffers"); /* Number of async I/O processes in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process"); /* * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with * vfs.aio.aio_listio_max. 
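 *
 * A hedged userland sketch of the corresponding sysconf(3) query
 * (illustrative only, not part of this file):
 *
 *	#include <unistd.h>
 *
 *	long lio_max = sysconf(_SC_AIO_LISTIO_MAX);
 *	// -1 means no fixed limit is advertised; otherwise this reflects
 *	// the vfs.aio.max_aio_queue_per_proc value above.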
*/ SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, 0, "Maximum aio requests for a single lio_listio call"); #ifdef COMPAT_FREEBSD6 typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; #endif /* * Below is a key of the locks used to protect each member of struct * kaiocb, aioliojob, and kaioinfo, and any backends. * * * - need not be protected * a - locked by kaioinfo lock * b - locked by the backend lock; the backend lock can be null in some * cases (BIO requests, for example), in which case the proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * If the routine that services an AIO request blocks while running in an * AIO kernel process it can starve other I/O requests. BIO requests * queued via aio_qbio() complete asynchronously and do not use AIO kernel * processes at all. Socket I/O requests use a separate pool of * kprocs and also force non-blocking I/O. Other file I/O requests * use the generic fo_read/fo_write operations which can block. The * fsync and mlock operations can also block while executing. Ideally * none of these requests would block while executing. * * Note that the service routines cannot toggle O_NONBLOCK in the file * structure directly while handling a request due to races with * userland threads. */ /* jobflags */ #define KAIOCB_QUEUEING 0x01 #define KAIOCB_CANCELLED 0x02 #define KAIOCB_CANCELLING 0x04 #define KAIOCB_CHECKSYNC 0x08 #define KAIOCB_CLEARED 0x10 #define KAIOCB_FINISHED 0x20 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aioproc { int aioprocflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ struct proc *aioproc; /* (*) the AIO proc */ }; /* * data structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) count of requests in this lio */ int lioj_finished_count; /* (a) count of finished requests */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_count; /* (a) size of AIO queue */ int kaio_buffer_count; /* (a) number of bio buffers */ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ struct task kaio_task; /* (*) task to kick aio processes */ struct task kaio_sync_task; /* (*) task to schedule
fsync jobs */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations. */ struct aiocb_ops { int (*aio_copyin)(struct aiocb *ujob, struct aiocb *kjob); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_biowakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qbio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); static bool aio_clear_cancel_function_locked(struct kaiocb *job); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io process data * aiocb async io jobs * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; static struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_kick); /* * Main operations function for use as a kernel module. 
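 *
 * A hedged aside: userland can probe for the facility declared by
 * FEATURE(aio) above with feature_present(3), e.g.:
 *
 *	#include <unistd.h>
 *
 *	if (feature_present("aio") != 1)
 *		;	// fall back to synchronous I/O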
*/ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); ki->kaio_flags = 0; ki->kaio_active_count = 0; ki->kaio_count = 0; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_syncqueue); TAILQ_INIT(&ki->kaio_syncready); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= KSI_EXT | KSI_INS; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. 
*/ static int aio_free_entry(struct kaiocb *job) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = job->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(job->jobflags & KAIOCB_FINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, job, plist); TAILQ_REMOVE(&ki->kaio_all, job, allist); lj = job->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* job is going away, we need to destroy any knotes */ knlist_delete(&job->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&job->ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * a kaiocb from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } static int aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) { aio_cancel_fn_t *func; int cancelled; AIO_LOCK_ASSERT(ki, MA_OWNED); if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) return (0); MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); job->jobflags |= KAIOCB_CANCELLED; func = job->cancel_fn; /* * If there is no cancel routine, just leave the job marked as * cancelled. The job should be in active use by a caller who * should complete it normally or when it fails to install a * cancel routine. */ if (func == NULL) return (0); /* * Set the CANCELLING flag so that aio_complete() will defer * completions of this job. This prevents the job from being * freed out from under the cancel callback. After the * callback any deferred completion (whether from the callback * or any other source) will be completed. */ job->jobflags |= KAIOCB_CANCELLING; AIO_UNLOCK(ki); func(job); AIO_LOCK(ki); job->jobflags &= ~KAIOCB_CANCELLING; if (job->jobflags & KAIOCB_FINISHED) { cancelled = job->uaiocb._aiocb_private.error == ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(p, job); } else { /* * The cancel callback might have scheduled an * operation to cancel this request, but it is * only counted as cancelled if the request is * cancelled when the callback returns. */ cancelled = 0; } return (cancelled); } /* * Rundown the jobs for a given process. 
*/ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct kaiocb *job, *jobn; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { aio_cancel_job(p, ki, job); } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(job); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct kaiocb * aio_selectjob(struct aioproc *aiop) { struct kaiocb *job; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); restart: TAILQ_FOREACH(job, &aio_jobs, list) { userp = job->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < max_aio_per_proc) { TAILQ_REMOVE(&aio_jobs, job, list); if (!aio_clear_cancel_function(job)) goto restart; /* Account for currently active jobs. */ ki->kaio_active_count++; break; } } return (job); } /* * Move all data to a permanent storage device. This code * simulates the fsync syscall. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp); vn_finished_write(mp); drop: return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-bio version of the operations. The normal * vn operations are used, and this code should work in all instances for every * type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. 
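 *
 * For orientation, the userland pattern this path ultimately serves
 * looks roughly like the sketch below (error handling elided, names
 * illustrative):
 *
 *	#include <aio.h>
 *
 *	struct aiocb cb = { .aio_fildes = fd, .aio_buf = buf,
 *	    .aio_nbytes = sizeof(buf), .aio_offset = 0 };
 *	aio_read(&cb);
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;			// poll; kevent(2) avoids spinning
 *	ssize_t done = aio_return(&cb);	// reaps the kernel job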
*/ static void aio_process_rw(struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct aiocb *cb; struct file *fp; struct uio auio; struct iovec aiov; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || job->uaiocb.aio_lio_opcode == LIO_WRITE, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; cb = &job->uaiocb; fp = job->fd_file; aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; if (auio.uio_resid == 0) error = 0; else error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; job->msgrcv = msgrcv_end - msgrcv_st; job->msgsnd = msgsnd_end - msgsnd_st; job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } cnt -= auio.uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, cnt, 0); } static void aio_process_sync(struct kaiocb *job) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct file *fp = job->fd_file; int error = 0; KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); td->td_ucred = job->cred; if (fp->f_vnode != NULL) error = aio_fsync_vnode(td, fp->f_vnode); td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, 0, 0); } static void aio_process_mlock(struct kaiocb *job) { struct aiocb *cb = &job->uaiocb; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); error = kern_mlock(job->userproc, job->cred, __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); aio_complete(job, error != 0 ? 
-1 : 0, error); } static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *sjob, *sjobn; int lj_done; bool schedule_fsync; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = job->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); MPASS(job->jobflags & KAIOCB_FINISHED); if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi); KNOTE_LOCKED(&job->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (job->jobflags & KAIOCB_CHECKSYNC) { schedule_fsync = false; TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { if (job->fd_file != sjob->fd_file || job->seqno >= sjob->seqno) continue; if (--sjob->pending > 0) continue; TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); if (!aio_clear_cancel_function_locked(sjob)) continue; TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); schedule_fsync = true; } if (schedule_fsync) taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_sync_task); } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } static void aio_schedule_fsync(void *context, int pending) { struct kaioinfo *ki; struct kaiocb *job; ki = context; AIO_LOCK(ki); while (!TAILQ_EMPTY(&ki->kaio_syncready)) { job = TAILQ_FIRST(&ki->kaio_syncready); TAILQ_REMOVE(&ki->kaio_syncready, job, list); AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); AIO_LOCK(ki); } AIO_UNLOCK(ki); } bool aio_cancel_cleared(struct kaiocb *job) { /* * The caller should hold the same queue lock held when * aio_clear_cancel_function() was called and set this flag * ensuring this check sees an up-to-date value. However, * there is no way to assert that. 
*/ return ((job->jobflags & KAIOCB_CLEARED) != 0); } static bool aio_clear_cancel_function_locked(struct kaiocb *job) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); MPASS(job->cancel_fn != NULL); if (job->jobflags & KAIOCB_CANCELLING) { job->jobflags |= KAIOCB_CLEARED; return (false); } job->cancel_fn = NULL; return (true); } bool aio_clear_cancel_function(struct kaiocb *job) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_clear_cancel_function_locked(job); AIO_UNLOCK(ki); return (ret); } static bool aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); if (job->jobflags & KAIOCB_CANCELLED) return (false); job->cancel_fn = func; return (true); } bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_set_cancel_function_locked(job, func); AIO_UNLOCK(ki); return (ret); } void aio_complete(struct kaiocb *job, long status, int error) { struct kaioinfo *ki; struct proc *userp; job->uaiocb._aiocb_private.error = error; job->uaiocb._aiocb_private.status = status; userp = job->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); KASSERT(!(job->jobflags & KAIOCB_FINISHED), ("duplicate aio_complete")); job->jobflags |= KAIOCB_FINISHED; if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(userp, job); } AIO_UNLOCK(ki); } void aio_cancel(struct kaiocb *job) { aio_complete(job, -1, ECANCELED); } void aio_switch_vmspace(struct kaiocb *job) { vmspace_switch_aio(job->userproc->p_vmspace); } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct kaiocb *job; struct aioproc *aiop; struct kaioinfo *ki; struct proc *p; struct vmspace *myvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = td->td_proc; myvm = vmspace_acquire_ref(p); KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aioproc = p; aiop->aioprocflags = 0; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((job = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); ki = job->userproc->p_aioinfo; job->handle_fn(job); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; } /* * Disconnect from user address space. */ if (p->p_vmspace != myvm) { mtx_unlock(&aio_job_mtx); vmspace_switch_aio(myvm); mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. 
*/ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && (aiop->aioprocflags & AIOP_FREE) && num_aio_procs > target_aio_procs) break; } TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); vmspace_free(myvm); KASSERT(p->p_vmspace == myvm, ("AIOD: bad vmspace for exiting daemon")); KASSERT(myvm->vm_refcnt > 1, ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt)); kproc_exit(0); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead bio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qbio(struct proc *p, struct kaiocb *job) { struct aiocb *cb; struct file *fp; struct bio *bp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; int error, ref, poff; vm_prot_t prot; cb = &job->uaiocb; fp = job->fd_file; if (!(cb->aio_lio_opcode == LIO_WRITE || cb->aio_lio_opcode == LIO_READ)) return (-1); if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (cb->aio_nbytes > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { if (cb->aio_nbytes > MAXPHYS) { error = -1; goto unref; } pbuf = NULL; } else { if (cb->aio_nbytes > MAXPHYS - poff) { error = -1; goto unref; } if (ki->kaio_buffer_count >= max_buf_aio) { error = EAGAIN; goto unref; } job->pbuf = pbuf = uma_zalloc(pbuf_zone, M_WAITOK); BUF_KERNPROC(pbuf); AIO_LOCK(ki); ki->kaio_buffer_count++; AIO_UNLOCK(ki); } job->bp = bp = g_alloc_bio(); bp->bio_length = cb->aio_nbytes; bp->bio_bcount = cb->aio_nbytes; bp->bio_done = aio_biowakeup; bp->bio_data = (void *)(uintptr_t)cb->aio_buf; bp->bio_offset = cb->aio_offset; bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? 
BIO_WRITE : BIO_READ; bp->bio_dev = dev; bp->bio_caller1 = (void *)job; prot = VM_PROT_READ; if (cb->aio_lio_opcode == LIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages, nitems(job->pages)); if (job->npages < 0) { error = EFAULT; goto doerror; } if (pbuf != NULL) { pmap_qenter((vm_offset_t)pbuf->b_data, job->pages, job->npages); bp->bio_data = pbuf->b_data + poff; atomic_add_int(&num_buf_aio, 1); } else { bp->bio_ma = job->pages; bp->bio_ma_n = job->npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; atomic_add_int(&num_unmapped_aio, 1); } /* Perform transfer. */ csw->d_strategy(bp); dev_relthread(dev, ref); return (0); doerror: if (pbuf != NULL) { AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); uma_zfree(pbuf_zone, pbuf); job->pbuf = NULL; } g_destroy_bio(bp); job->bp = NULL; unref: dev_relthread(dev, ref); return (error); } #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb *ojob; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, kjob, sizeof(struct oaiocb)); if (error) return (error); ojob = (struct oaiocb *)kjob; return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) { return (copyin(ujob, kjob, sizeof(struct aiocb))); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .aio_copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb_ops_osigevent = { .aio_copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #endif /* * Queue a new AIO request. Choosing either the threaded or direct bio VCHR * technique is done in this code. 
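 *
 * Completion of either flavour can be collected with kqueue(2) when
 * the request is submitted with SIGEV_KEVENT, per aio(4).  A hedged
 * sketch (error handling elided):
 *
 *	int kq = kqueue();
 *	struct aiocb cb = { .aio_fildes = fd, .aio_buf = buf,
 *	    .aio_nbytes = sizeof(buf) };
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
 *	aio_read(&cb);
 *
 *	struct kevent ev;
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	// ev.udata == &cb here
 *	ssize_t done = aio_return(ev.udata);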
*/ int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct file *fp; struct kaiocb *job; struct kaioinfo *ki; struct kevent kev; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(ujob, -1); ops->store_error(ujob, 0); ops->store_kernelinfo(ujob, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= max_aio_queue_per_proc) { ops->store_error(ujob, EAGAIN); return (EAGAIN); } job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&job->klist, AIO_MTX(ki)); error = ops->aio_copyin(ujob, &job->uaiocb); if (error) { ops->store_error(ujob, error); uma_zfree(aiocb_zone, job); return (error); } if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { uma_zfree(aiocb_zone, job); return (EINVAL); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { ops->store_error(ujob, EINVAL); uma_zfree(aiocb_zone, job); return (EINVAL); } if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, job); return (EINVAL); } ksiginfo_init(&job->ksi); /* Save userspace address of the job info. */ job->ujob = ujob; /* Get the opcode. */ if (type != LIO_NOP) job->uaiocb.aio_lio_opcode = type; opcode = job->uaiocb.aio_lio_opcode; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capability * should be.
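 *
 * Under Capsicum this is user-visible: a descriptor limited to
 * CAP_PREAD can be used with aio_read(2) but not aio_write(2).  A
 * hedged sketch:
 *
 *	#include <sys/capsicum.h>
 *
 *	cap_rights_t rights;
 *	cap_rights_init(&rights, CAP_PREAD);
 *	cap_rights_limit(fd, &rights);	// aio_write() now fails
 *					// with ENOTCAPABLE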
*/ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: error = fget_write(td, fd, &cap_pwrite_rights, &fp); break; case LIO_READ: error = fget_read(td, fd, &cap_pread_rights, &fp); break; case LIO_SYNC: error = fget(td, fd, &cap_fsync_rights, &fp); break; case LIO_MLOCK: fp = NULL; break; case LIO_NOP: error = fget(td, fd, &cap_no_rights, &fp); break; default: error = EINVAL; } if (error) { uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } if (opcode == LIO_SYNC && fp->f_vnode == NULL) { error = EINVAL; goto aqueue_fail; } if ((opcode == LIO_READ || opcode == LIO_WRITE) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; goto aqueue_fail; } job->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; job->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(ujob, jid); if (error) { error = EINVAL; goto aqueue_fail; } job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, job); return (0); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto aqueue_fail; } kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; memset(&kev, 0, sizeof(kev)); kev.ident = (uintptr_t)job->ujob; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)job; kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, M_WAITOK); if (error) goto aqueue_fail; no_kqueue: ops->store_error(ujob, EINPROGRESS); job->uaiocb._aiocb_private.error = EINPROGRESS; job->userproc = p; job->cred = crhold(td->td_ucred); job->jobflags = KAIOCB_QUEUEING; job->lio = lj; if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; } else if (fp->f_ops->fo_aio_queue == NULL) error = aio_queue_file(fp, job); else error = fo_aio_queue(fp, job); if (error) goto aqueue_fail; AIO_LOCK(ki); job->jobflags &= ~KAIOCB_QUEUEING; TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); ki->kaio_count++; if (lj) lj->lioj_count++; atomic_add_int(&num_queue_count, 1); if (job->jobflags & KAIOCB_FINISHED) { /* * The queue callback completed the request synchronously. * The bulk of the completion is deferred in that case * until this point. 
*/ aio_bio_done_notify(p, job); } else TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); AIO_UNLOCK(ki); return (0); aqueue_fail: knlist_delete(&job->klist, curthread, 0); if (fp) fdrop(fp, td); uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } static void aio_cancel_daemon_job(struct kaiocb *job) { mtx_lock(&aio_job_mtx); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&aio_jobs, job, list); mtx_unlock(&aio_job_mtx); aio_cancel(job); } void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) { mtx_lock(&aio_job_mtx); if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { mtx_unlock(&aio_job_mtx); aio_cancel(job); return; } job->handle_fn = func; TAILQ_INSERT_TAIL(&aio_jobs, job, list); aio_kick_nowait(job->userproc); mtx_unlock(&aio_job_mtx); } static void aio_cancel_sync(struct kaiocb *job) { struct kaioinfo *ki; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); aio_cancel(job); } int aio_queue_file(struct file *fp, struct kaiocb *job) { struct kaioinfo *ki; struct kaiocb *job2; struct vnode *vp; struct mount *mp; int error; bool safe; ki = job->userproc->p_aioinfo; error = aio_qbio(job->userproc, job); if (error >= 0) return (error); safe = false; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VREG || vp->v_type == VDIR) { mp = fp->f_vnode->v_mount; if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) safe = true; } } if (!(safe || enable_aio_unsafe)) { counted_warning(&unsafe_warningcnt, "is attempting to use unsafe AIO requests"); return (EOPNOTSUPP); } switch (job->uaiocb.aio_lio_opcode) { case LIO_READ: case LIO_WRITE: aio_schedule(job, aio_process_rw); error = 0; break; case LIO_SYNC: AIO_LOCK(ki); TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { if (job2->fd_file == job->fd_file && job2->uaiocb.aio_lio_opcode != LIO_SYNC && job2->seqno < job->seqno) { job2->jobflags |= KAIOCB_CHECKSYNC; job->pending++; } } if (job->pending != 0) { if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) { AIO_UNLOCK(ki); aio_cancel(job); return (0); } TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); return (0); } AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); error = 0; break; default: error = EINVAL; } return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; 
mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; long status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_done, plist) { if (job->ujob == ujob) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_error(ujob, error); ops->store_status(ujob, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *firstjob, *job; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { firstjob = NULL; error = 0; TAILQ_FOREACH(job, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (job->ujob == ujoblist[i]) { if (firstjob == NULL) firstjob = job; if (job->jobflags & KAIOCB_FINISHED) goto RETURN; } } } /* All tasks were finished. */ if (firstjob == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); free(ujoblist, M_AIOS); return (error); } /* * aio_cancel cancels any non-bio aio operations not currently in progress. */ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct kaiocb *job, *jobn; struct file *fp; int error; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. 
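 * The result reported to userland is AIO_CANCELED, AIO_NOTCANCELED
 * (something was in flight; poll with aio_error(2)) or AIO_ALLDONE.
 * A hedged sketch of the calling side:
 *
 *	int r = aio_cancel(fd, NULL);	// NULL: every request on fd
 *	if (r == AIO_NOTCANCELED)
 *		;	// requests still running; reap via aio_return()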
*/ error = fget(td, uap->fd, &cap_no_rights, &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; - if (vn_isdisk(vp, &error)) { + if (vn_isdisk(vp)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { if ((uap->fd == job->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == job->ujob))) { if (aio_cancel_job(p, ki, job)) { cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_all, allist) { if (job->ujob == ujob) { if (job->jobflags & KAIOCB_FINISHED) td->td_retval[0] = job->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = ops->fetch_status(ujob); if (status == -1) { td->td_retval[0] = ops->fetch_error(ujob); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } #endif int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } #endif int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *job; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nagain, nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. 
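 * (Only for LIO_NOWAIT; with LIO_WAIT the caller just sleeps.)  A
 * hedged sketch of the submitting side, with illustrative names:
 *
 *	struct aiocb rd = { .aio_fildes = fd, .aio_buf = buf,
 *	    .aio_nbytes = sizeof(buf), .aio_lio_opcode = LIO_READ };
 *	struct aiocb *list[1] = { &rd };
 *	struct sigevent se = { .sigev_notify = SIGEV_SIGNAL,
 *	    .sigev_signo = SIGUSR1 };
 *	lio_listio(LIO_NOWAIT, list, 1, &se);	// SIGUSR1 when all done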
*/ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ memset(&kev, 0, sizeof(kev)); kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, M_WAITOK); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nagain = 0; nerror = 0; for (i = 0; i < nent; i++) { job = acb_list[i]; if (job != NULL) { error = aio_aqueue(td, job, lj, LIO_NOP, ops); if (error == EAGAIN) nagain++; else if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); else if (nagain) return (EAGAIN); else return (error); } /* syscall - list directed I/O (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif /* syscall - list directed I/O (REALTIME) */ int 
sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_biowakeup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct proc *userp; struct kaioinfo *ki; size_t nbytes; int error, nblks; /* Release mapping into kernel space. */ userp = job->userproc; ki = userp->p_aioinfo; if (job->pbuf) { pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); uma_zfree(pbuf_zone, job->pbuf); job->pbuf = NULL; atomic_subtract_int(&num_buf_aio, 1); AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); } else atomic_subtract_int(&num_unmapped_aio, 1); vm_page_unhold_pages(job->pages, job->npages); bp = job->bp; job->bp = NULL; nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; error = 0; if (bp->bio_flags & BIO_ERROR) error = bp->bio_error; nblks = btodb(nbytes); if (job->uaiocb.aio_lio_opcode == LIO_WRITE) job->outblock += nblks; else job->inblock += nblks; if (error) aio_complete(job, -1, error); else aio_complete(job, nbytes, 0); g_destroy_bio(bp); } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *job; struct aiocb *ujob; long error, status; int timo; ops->store_aiocb(ujobp, NULL); if (ts == NULL) { timo = 0; } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timo = -1; } else { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; job = NULL; AIO_LOCK(ki); while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { if (timo == -1) { error = EWOULDBLOCK; break; } ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); ujob = job->ujob; status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_aiocb(ujobp, ujob); ops->store_error(ujob, error); ops->store_status(ujob, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. 
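 * (A NULL timeout blocks; an all-zero timespec polls, per the
 * kern_aio_waitcomplete() logic above.)  A hedged userland sketch:
 *
 *	struct aiocb *done;
 *	ssize_t n = aio_waitcomplete(&done, NULL);	// next completion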
*/ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, struct aiocb_ops *ops) { if (op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct kaiocb *job; job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; /* * The job pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = job; kn->kn_flags &= ~EV_FLAG1; knlist_add(&job->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct kaiocb *job = kn->kn_ptr.p_aio; kn->kn_data = job->uaiocb._aiocb_private.error; if (!(job->jobflags & KAIOCB_FINISHED)) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob *lj; lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; #ifdef COMPAT_FREEBSD6 typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; #endif typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, 
SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb32 job32; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) { struct aiocb32 job32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .aio_copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb32_ops_osigevent = { .aio_copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #endif int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 
ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent - 1; i >= 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } free(ujoblist, M_AIOS); return (error); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_read(struct thread *td, struct freebsd6_freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, struct freebsd6_freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. 
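 *
 * (CP() copies a single field between the differently-sized structures;
 * illustratively, CP(ts32, ts, tv_sec) below amounts to
 * ts.tv_sec = ts32.tv_sec, widening the 32-bit value to the native type.)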
*/ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_lio_listio(struct thread *td, struct freebsd6_freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 364371) +++ head/sys/kern/vfs_bio.c (revision 364372) @@ -1,5475 +1,5475 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; struct bufqueue { struct mtx_padalign bq_lock; TAILQ_HEAD(, buf) bq_queue; uint8_t bq_index; uint16_t bq_subqueue; int bq_len; } __aligned(CACHE_LINE_SIZE); #define BQ_LOCKPTR(bq) (&(bq)->bq_lock) #define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq))) #define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq))) #define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED) struct bufdomain { struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */ struct bufqueue bd_dirtyq; struct bufqueue *bd_cleanq; struct mtx_padalign bd_run_lock; /* Constants */ long bd_maxbufspace; long bd_hibufspace; long bd_lobufspace; long bd_bufspacethresh; int bd_hifreebuffers; int bd_lofreebuffers; int bd_hidirtybuffers; int bd_lodirtybuffers; int bd_dirtybufthresh; int bd_lim; /* atomics */ int bd_wanted; int __aligned(CACHE_LINE_SIZE) bd_numdirtybuffers; int __aligned(CACHE_LINE_SIZE) bd_running; long __aligned(CACHE_LINE_SIZE) bd_bufspace; int __aligned(CACHE_LINE_SIZE) bd_freebuffers; } __aligned(CACHE_LINE_SIZE); #define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock) #define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd))) #define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd))) #define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED) #define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock) #define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd))) #define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd))) #define BD_DOMAIN(bd) (bd - bdomain) static struct buf *buf; /* buffer header pool */ 
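/*
 * An illustrative (hypothetical) sketch of the idiom the bufqueue macros
 * above encode; each subqueue is covered by its own bq_lock, so a typical
 * insertion is:
 *
 *	BQ_LOCK(bq);
 *	TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
 *	bq->bq_len++;
 *	BQ_UNLOCK(bq);
 */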
extern struct buf *swbuf; /* Swap buffer header pool. */ caddr_t __read_mostly unmapped_buf; /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */ struct proc *bufdaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, int newbsize); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m); static void vfs_clean_pages_dirty_buf(struct buf *bp); static void vfs_setdirty_range(struct buf *bp); static void vfs_vmio_invalidate(struct buf *bp); static void vfs_vmio_truncate(struct buf *bp, int npages); static void vfs_vmio_extend(struct buf *bp, int npages, int size); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *)); static int buf_flush(struct vnode *vp, struct bufdomain *, int); static int flushbufqueues(struct vnode *, struct bufdomain *, int, int); static void buf_daemon(void); static __inline void bd_wakeup(void); static int sysctl_runningspace(SYSCTL_HANDLER_ARGS); static void bufkva_reclaim(vmem_t *, int); static void bufkva_free(struct buf *); static int buf_import(void *, void **, int, int, int); static void buf_release(void *, void **, int); static void maxbcachebuf_adjust(void); static inline struct bufdomain *bufdomain(struct buf *); static void bq_remove(struct bufqueue *bq, struct buf *bp); static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock); static int buf_recycle(struct bufdomain *, bool kva); static void bq_init(struct bufqueue *bq, int qindex, int cpu, const char *lockname); static void bd_init(struct bufdomain *bd); static int bd_flushall(struct bufdomain *bd); static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS); static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS); static int sysctl_bufspace(SYSCTL_HANDLER_ARGS); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); long runningbufspace; SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers"); static counter_u64_t bufkvaspace; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, "Kernel virtual memory used for buffers"); static long maxbufspace; SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace, __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (including metadata)"); static long bufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static long maxbufmallocspace; SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static long lobufspace; SYSCTL_PROC(_vfs, OID_AUTO, lobufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace, __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L", "Minimum amount of buffers we want to have"); long hibufspace; SYSCTL_PROC(_vfs, OID_AUTO, hibufspace, 
CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace, __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L", "Maximum allowed value of bufspace (excluding metadata)"); long bufspacethresh; SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh, __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L", "Bufspace consumed before waking the daemon to free some"); static counter_u64_t buffreekvacnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, "Number of times we have freed the KVA space from some buffer"); static counter_u64_t bufdefragcnt; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, "Number of times we have had to repeat buffer allocation to defragment"); static long lorunningspace; SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L", "Minimum preferred space used for in-progress I/O"); static long hirunningspace; SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE | CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L", "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW | CTLFLAG_STATS, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW | CTLFLAG_STATS, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I", "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers, __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "I", "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers, __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "I", "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh, __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "I", "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers, __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "I", "Target number of free buffers"); static int hifreebuffers; SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers, CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers, __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "I", "Threshold for clean buffer 
recycling"); static counter_u64_t getnewbufcalls; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, "Number of calls to getnewbuf"); static counter_u64_t getnewbufrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, "Number of times getnewbuf has had to restart a buffer acquisition"); static counter_u64_t mappingrestarts; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD, &mappingrestarts, "Number of times getblk has had to restart a buffer mapping for " "unmapped buffer"); static counter_u64_t numbufallocfails; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, "Number of times buffer allocations failed"); static int flushbufqtarget = 100; SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0, "Amount of work to do in flushbufqueues when helping bufdaemon"); static counter_u64_t notbufdflushes; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, ¬bufdflushes, "Number of dirty buffer flushes done by the bufdaemon helpers"); static long barrierwrites; SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW | CTLFLAG_STATS, &barrierwrites, 0, "Number of barrier writes"); SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD, &unmapped_buf_allowed, 0, "Permit the use of the unmapped i/o"); int maxbcachebuf = MAXBCACHEBUF; SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0, "Maximum size of a buffer cache block"); /* * This lock synchronizes access to bd_request. */ static struct mtx_padalign __exclusive_cache_line bdlock; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx_padalign __exclusive_cache_line rbreqlock; /* * Lock that protects bdirtywait. */ static struct mtx_padalign __exclusive_cache_line bdirtylock; /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * Request for the buf daemon to write more buffers than is indicated by * lodirtybuf. This may be necessary to push out excess dependencies or * defragment the address space where a simple count of the number of dirty * buffers is insufficient to characterize the demand for flushing them. */ static int bd_speedupreq; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. * Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * Synchronization for bwillwrite() waiters. */ static int bdirtywait; /* * Definitions for the buffer free lists. */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_EMPTY 1 /* empty buffer headers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */ #define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */ /* Maximum number of buffer domains. */ #define BUF_DOMAINS 8 struct bufdomainset bdlodirty; /* Domains > lodirty */ struct bufdomainset bdhidirty; /* Domains > hidirty */ /* Configured number of clean queues. */ static int __read_mostly buf_domains; BITSET_DEFINE(bufdomainset, BUF_DOMAINS); struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS]; struct bufqueue __exclusive_cache_line bqempty; /* * per-cpu empty buffer cache. */ uma_zone_t buf_zone; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. 
* buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; static int sysctl_runningspace(SYSCTL_HANDLER_ARGS) { long value; int error; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&rbreqlock); if (arg1 == &hirunningspace) { if (value < lorunningspace) error = EINVAL; else hirunningspace = value; } else { KASSERT(arg1 == &lorunningspace, ("%s: unknown arg1", __func__)); if (value > hirunningspace) error = EINVAL; else lorunningspace = value; } mtx_unlock(&rbreqlock); return (error); } static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS) { int error; int value; int i; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(int *)arg1 = value; for (i = 0; i < buf_domains; i++) *(int *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS) { long value; int error; int i; value = *(long *)arg1; error = sysctl_handle_long(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); *(long *)arg1 = value; for (i = 0; i < buf_domains; i++) *(long *)(uintptr_t)(((uintptr_t)&bdomain[i]) + arg2) = value / buf_domains; return (error); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int ivalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long)) return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) /* On overflow, still write out a long to trigger ENOMEM. */ return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } #else static int sysctl_bufspace(SYSCTL_HANDLER_ARGS) { long lvalue; int i; lvalue = 0; for (i = 0; i < buf_domains; i++) lvalue += bdomain[i].bd_bufspace; return (sysctl_handle_long(oidp, &lvalue, 0, req)); } #endif static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS) { int value; int i; value = 0; for (i = 0; i < buf_domains; i++) value += bdomain[i].bd_numdirtybuffers; return (sysctl_handle_int(oidp, &value, 0, req)); } /* * bdirtywakeup: * * Wakeup any bwillwrite() waiters. */ static void bdirtywakeup(void) { mtx_lock(&bdirtylock); if (bdirtywait) { bdirtywait = 0; wakeup(&bdirtywait); } mtx_unlock(&bdirtylock); } /* * bd_clear: * * Clear a domain from the appropriate bitsets when dirtybuffers * is decremented. */ static void bd_clear(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers) BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bd_set: * * Set a domain in the appropriate bitsets when dirtybuffers * is incremented. */ static void bd_set(struct bufdomain *bd) { mtx_lock(&bdirtylock); if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty); if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers) BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty); mtx_unlock(&bdirtylock); } /* * bdirtysub: * * Decrement the numdirtybuffers count by one and wakeup any * threads blocked in bwillwrite(). 
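 *
 * Illustratively, with hypothetical limits bd_lodirtybuffers = 100 and
 * bd_hidirtybuffers = 200, the wakeup below fires exactly when the
 * pre-decrement count equals (100 + 200) / 2 = 150.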
*/ static void bdirtysub(struct buf *bp) { struct bufdomain *bd; int num; bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bdirtywakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_clear(bd); } /* * bdirtyadd: * * Increment the numdirtybuffers count by one and wakeup the buf * daemon if needed. */ static void bdirtyadd(struct buf *bp) { struct bufdomain *bd; int num; /* * Only do the wakeup once as we cross the boundary. The * buf daemon will keep running until the condition clears. */ bd = bufdomain(bp); num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1); if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2) bd_wakeup(); if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers) bd_set(bd); } /* * bufspace_daemon_wakeup: * * Wakeup the daemons responsible for freeing clean bufs. */ static void bufspace_daemon_wakeup(struct bufdomain *bd) { /* * avoid the lock if the daemon is running. */ if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) { BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 1); wakeup(&bd->bd_running); BD_RUN_UNLOCK(bd); } } /* * bufspace_daemon_wait: * * Sleep until the domain falls below a limit or one second passes. */ static void bufspace_daemon_wait(struct bufdomain *bd) { /* * Re-check our limits and sleep. bd_running must be * cleared prior to checking the limits to avoid missed * wakeups. The waker will adjust one of bufspace or * freebuffers prior to checking bd_running. */ BD_RUN_LOCK(bd); atomic_store_int(&bd->bd_running, 0); if (bd->bd_bufspace < bd->bd_bufspacethresh && bd->bd_freebuffers > bd->bd_lofreebuffers) { msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP, "-", hz); } else { /* Avoid spurious wakeups while running. */ atomic_store_int(&bd->bd_running, 1); BD_RUN_UNLOCK(bd); } } /* * bufspace_adjust: * * Adjust the reported bufspace for a KVA managed buffer, possibly * waking any waiters. */ static void bufspace_adjust(struct buf *bp, int bufsize) { struct bufdomain *bd; long space; int diff; KASSERT((bp->b_flags & B_MALLOC) == 0, ("bufspace_adjust: malloc buf %p", bp)); bd = bufdomain(bp); diff = bufsize - bp->b_bufsize; if (diff < 0) { atomic_subtract_long(&bd->bd_bufspace, -diff); } else if (diff > 0) { space = atomic_fetchadd_long(&bd->bd_bufspace, diff); /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && space + diff >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); } bp->b_bufsize = bufsize; } /* * bufspace_reserve: * * Reserve bufspace before calling allocbuf(). metadata has a * different space limit than data. */ static int bufspace_reserve(struct bufdomain *bd, int size, bool metadata) { long limit, new; long space; if (metadata) limit = bd->bd_maxbufspace; else limit = bd->bd_hibufspace; space = atomic_fetchadd_long(&bd->bd_bufspace, size); new = space + size; if (new > limit) { atomic_subtract_long(&bd->bd_bufspace, size); return (ENOSPC); } /* Wake up the daemon on the transition. */ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh) bufspace_daemon_wakeup(bd); return (0); } /* * bufspace_release: * * Release reserved bufspace after bufspace_adjust() has consumed it. */ static void bufspace_release(struct bufdomain *bd, int size) { atomic_subtract_long(&bd->bd_bufspace, size); } /* * bufspace_wait: * * Wait for bufspace, acting as the buf daemon if a locked vnode is * supplied. bd_wanted must be set prior to polling for space. 
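 *
 * A minimal (hypothetical) sketch of that reserve/wait contract, not the
 * exact getnewbuf() flow:
 *
 *	while (bufspace_reserve(bd, maxsize, metadata) != 0)
 *		bufspace_wait(bd, vp, gbflags, slpflag, slptimeo);
 *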
The * operation must be re-tried on return. */ static void bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags, int slpflag, int slptimeo) { struct thread *td; int error, fl, norunbuf; if ((gbflags & GB_NOWAIT_BD) != 0) return; td = curthread; BD_LOCK(bd); while (bd->bd_wanted) { if (vp != NULL && vp->v_type != VCHR && (td->td_pflags & TDP_BUFNEED) == 0) { BD_UNLOCK(bd); /* * getblk() is called with a vnode locked, and * some majority of the dirty buffers may as * well belong to the vnode. Flushing the * buffers there would make progress that * cannot be achieved by the buf_daemon, which * cannot lock the vnode. */ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) | (td->td_pflags & TDP_NORUNNINGBUF); /* * Play bufdaemon. The getnewbuf() function * may be called while the thread owns the lock * for another dirty buffer of the same * vnode, which makes it impossible to use * VOP_FSYNC() there, due to the buffer lock * recursion. */ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; fl = buf_flush(vp, bd, flushbufqtarget); td->td_pflags &= norunbuf; BD_LOCK(bd); if (fl != 0) continue; if (bd->bd_wanted == 0) break; } error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd), (PRIBIO + 4) | slpflag, "newbuf", slptimeo); if (error != 0) break; } BD_UNLOCK(bd); } /* * bufspace_daemon: * * buffer space management daemon. Tries to maintain some marginal * amount of free buffer space so that requesting processes neither * block nor work to reclaim buffers. */ static void bufspace_daemon(void *arg) { struct bufdomain *bd; EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, SHUTDOWN_PRI_LAST + 100); bd = arg; for (;;) { kthread_suspend_check(); /* * Free buffers from the clean queue until we meet our * targets. * * Theory of operation: The buffer cache is most efficient * when some free buffer headers and space are always * available to getnewbuf(). This daemon attempts to prevent * the excessive blocking and synchronization associated * with shortfall. It goes through three phases according * to demand: * * 1) The daemon wakes up voluntarily once per second * during idle periods when the counters are below * the wakeup thresholds (bufspacethresh, lofreebuffers). * * 2) The daemon wakes up as we cross the thresholds * ahead of any potential blocking. This may bounce * slightly according to the rate of consumption and * release. * * 3) The daemon and consumers are starved for working * clean buffers. This is the 'bufspace' sleep below, * which will inefficiently trade bufs with bqrelse() * until we return to condition 2. */ while (bd->bd_bufspace > bd->bd_lobufspace || bd->bd_freebuffers < bd->bd_hifreebuffers) { if (buf_recycle(bd, false) != 0) { if (bd_flushall(bd)) continue; /* * Speed up dirty buffer flushing if we've run * out of clean buffers. This is possible in * particular because softdep may hold many * bufs locked, pending writes to other bufs * which are marked for delayed write, * exhausting clean space until they are * written. */ bd_speedup(); BD_LOCK(bd); if (bd->bd_wanted) { msleep(&bd->bd_wanted, BD_LOCKPTR(bd), PRIBIO|PDROP, "bufspace", hz/10); } else BD_UNLOCK(bd); } maybe_yield(); } bufspace_daemon_wait(bd); } } /* * bufmallocadjust: * * Adjust the reported bufmallocspace for a malloc managed buffer; unlike * bufspace_adjust(), there are no waiters to wake.
*/ static void bufmallocadjust(struct buf *bp, int bufsize) { int diff; KASSERT((bp->b_flags & B_MALLOC) != 0, ("bufmallocadjust: non-malloc buf %p", bp)); diff = bufsize - bp->b_bufsize; if (diff < 0) atomic_subtract_long(&bufmallocspace, -diff); else atomic_add_long(&bufmallocspace, diff); bp->b_bufsize = bufsize; } /* * runningwakeup: * * Wake up processes that are waiting on asynchronous writes to fall * below lorunningspace. */ static void runningwakeup(void) { mtx_lock(&rbreqlock); if (runningbufreq) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } /* * runningbufwakeup: * * Decrement the outstanding write count accordingly. */ void runningbufwakeup(struct buf *bp) { long space, bspace; bspace = bp->b_runningbufspace; if (bspace == 0) return; space = atomic_fetchadd_long(&runningbufspace, -bspace); KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld", space, bspace)); bp->b_runningbufspace = 0; /* * Only acquire the lock and wakeup on the transition from exceeding * the threshold to falling below it. */ if (space < lorunningspace) return; if (space - bspace > lorunningspace) return; runningwakeup(); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { runningbufreq = 1; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { /* * This function and its results are protected by higher level * synchronization requiring vnode and buf locks to page in and * validate pages. */ if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static void bd_wakeup(void) { mtx_lock(&bdlock); if (bd_request == 0) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * Adjust the maxbcachebuf tunable. */ static void maxbcachebuf_adjust(void) { int i; /* * maxbcachebuf must be a power of 2 >= MAXBSIZE. */ i = 2; while (i * 2 <= maxbcachebuf) i *= 2; maxbcachebuf = i; if (maxbcachebuf < MAXBSIZE) maxbcachebuf = MAXBSIZE; if (maxbcachebuf > MAXPHYS) maxbcachebuf = MAXPHYS; if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF) printf("maxbcachebuf=%d\n", maxbcachebuf); } /* * bd_speedup - speed up the buffer cache flushing code */ void bd_speedup(void) { int needwake; mtx_lock(&bdlock); needwake = 0; if (bd_speedupreq == 0 || bd_request == 0) needwake = 1; bd_speedupreq = 1; bd_request = 1; if (needwake) wakeup(&bd_request); mtx_unlock(&bdlock); } #ifdef __i386__ #define TRANSIENT_DENOM 5 #else #define TRANSIENT_DENOM 10 #endif /* * Calculate buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more than once.
We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { int tuned_nbuf; long maxbuf, maxbuf_sz, buf_sz, biotmap_sz; /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); maxbcachebuf_adjust(); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/10 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += min((physmem_est - 65536) * 2 / (factor * 5), 32 * 1024 * 1024 / (factor * 5)); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; tuned_nbuf = 1; } else tuned_nbuf = 0; /* XXX Avoid unsigned long overflows later on with maxbufspace. */ maxbuf = (LONG_MAX / 3) / BKVASIZE; if (nbuf > maxbuf) { if (!tuned_nbuf) printf("Warning: nbufs lowered from %d to %ld\n", nbuf, maxbuf); nbuf = maxbuf; } /* * Ideal allocation size for the transient bio submap is 10% * of the maximal space buffer map. This roughly corresponds * to the amount of the buffer mapped for typical UFS load. * * Clip the buffer map to reserve space for the transient * BIOs, if its extent is bigger than 90% (80% on i386) of the * maximum buffer map extent on the platform. * * The fall-back to the maxbuf in case of maxbcache unset, * allows to not trim the buffer KVA for the architectures * with ample KVA space. */ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) { maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE; buf_sz = (long)nbuf * BKVASIZE; if (buf_sz < maxbuf_sz / TRANSIENT_DENOM * (TRANSIENT_DENOM - 1)) { /* * There is more KVA than memory. Do not * adjust buffer map size, and assign the rest * of maxbuf to transient map. */ biotmap_sz = maxbuf_sz - buf_sz; } else { /* * Buffer map spans all KVA we could afford on * this platform. Give 10% (20% on i386) of * the buffer map to the transient bio map. */ biotmap_sz = buf_sz / TRANSIENT_DENOM; buf_sz -= biotmap_sz; } if (biotmap_sz / INT_MAX > MAXPHYS) bio_transient_maxcnt = INT_MAX; else bio_transient_maxcnt = biotmap_sz / MAXPHYS; /* * Artificially limit to 1024 simultaneous in-flight I/Os * using the transient mapping. */ if (bio_transient_maxcnt > 1024) bio_transient_maxcnt = 1024; if (tuned_nbuf) nbuf = buf_sz / BKVASIZE; } if (nswbuf == 0) { nswbuf = min(nbuf / 4, 256); if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; } /* * Reserve space for the buffer cache buffers */ buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. 
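 *
 * Illustratively, on a hypothetical machine with physmem_est = 1048576 KiB
 * (1 GiB) and BKVASIZE = 16 KiB, the autotuning above gives factor = 64
 * and nbuf = 50 + 1024 + 6144 = 7218 buffer headers.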
*/ void bufinit(void) { struct buf *bp; int i; KASSERT(maxbcachebuf >= MAXBSIZE, ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf, MAXBSIZE)); bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock"); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF); unmapped_buf = (caddr_t)kva_alloc(MAXPHYS); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_NONE; bp->b_domain = -1; bp->b_subqueue = mp_maxid + 1; bp->b_xflags = 0; bp->b_data = bp->b_kvabase = unmapped_buf; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); bq_insert(&bqempty, bp, false); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by metadata. hibufspace is the nominal maximum * used by most other requests. The differential is required to * ensure that metadata deadlocks don't occur. * * maxbufspace is based on BKVASIZE. Allocating buffers larger than * this may result in KVM fragmentation which is not handled optimally * by the system. XXX This is less true with vmem. We could use * PAGE_SIZE. */ maxbufspace = (long)nbuf * BKVASIZE; hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10); lobufspace = (hibufspace / 20) * 19; /* 95% */ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2; /* * Note: The 16 MiB upper limit for hirunningspace was chosen * arbitrarily and may need further tuning. It corresponds to * 128 outstanding write IO requests (if IO size is 128 KiB), * which fits with many RAID controllers' tagged queuing limits. * The lower 1 MiB limit is the historical upper limit for * hirunningspace. */ hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf), 16 * 1024 * 1024), 1024 * 1024); lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf); /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on * average (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occurring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; /* * To support extreme low-memory systems, make sure hidirtybuffers * cannot eat up all available buffer space. This occurs when our * minimum cannot be met. We try to size hidirtybuffers to 3/4 our * buffer space assuming BKVASIZE'd buffers. */ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * lofreebuffers should be sufficient to avoid stalling waiting on * buf headers under heavy utilization. The bufs in per-cpu caches * are counted as free but will be unavailable to threads executing * on other cpus. * * hifreebuffers is the free target for the bufspace daemon. This * should be set appropriately to limit work per iteration. */ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus); hifreebuffers = (3 * lofreebuffers) / 2; numfreebuffers = nbuf; /* Set up the kva and free list allocators.
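 *
 * (Continuing the hypothetical 1 GiB example: maxbufspace = 7218 * 16 KiB,
 * about 113 MiB, so the howmany(maxbufspace, 256 MB) sizing below yields a
 * single buffer domain; roughly 2 GiB of buffer space would be needed to
 * populate all BUF_DOMAINS = 8.)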
*/ vmem_set_reclaim(buffer_arena, bufkva_reclaim); buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf), NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0); /* * Size the clean queue according to the amount of buffer space. * One queue per-256mb up to the max. More queues gives better * concurrency but less accurate LRU. */ buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS); for (i = 0 ; i < buf_domains; i++) { struct bufdomain *bd; bd = &bdomain[i]; bd_init(bd); bd->bd_freebuffers = nbuf / buf_domains; bd->bd_hifreebuffers = hifreebuffers / buf_domains; bd->bd_lofreebuffers = lofreebuffers / buf_domains; bd->bd_bufspace = 0; bd->bd_maxbufspace = maxbufspace / buf_domains; bd->bd_hibufspace = hibufspace / buf_domains; bd->bd_lobufspace = lobufspace / buf_domains; bd->bd_bufspacethresh = bufspacethresh / buf_domains; bd->bd_numdirtybuffers = 0; bd->bd_hidirtybuffers = hidirtybuffers / buf_domains; bd->bd_lodirtybuffers = lodirtybuffers / buf_domains; bd->bd_dirtybufthresh = dirtybufthresh / buf_domains; /* Don't allow more than 2% of bufs in the per-cpu caches. */ bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus; } getnewbufcalls = counter_u64_alloc(M_WAITOK); getnewbufrestarts = counter_u64_alloc(M_WAITOK); mappingrestarts = counter_u64_alloc(M_WAITOK); numbufallocfails = counter_u64_alloc(M_WAITOK); notbufdflushes = counter_u64_alloc(M_WAITOK); buffreekvacnt = counter_u64_alloc(M_WAITOK); bufdefragcnt = counter_u64_alloc(M_WAITOK); bufkvaspace = counter_u64_alloc(M_WAITOK); } #ifdef INVARIANTS static inline void vfs_buf_check_mapped(struct buf *bp) { KASSERT(bp->b_kvabase != unmapped_buf, ("mapped buf: b_kvabase was not updated %p", bp)); KASSERT(bp->b_data != unmapped_buf, ("mapped buf: b_data was not updated %p", bp)); KASSERT(bp->b_data < unmapped_buf || bp->b_data >= unmapped_buf + MAXPHYS, ("b_data + b_offset unmapped %p", bp)); } static inline void vfs_buf_check_unmapped(struct buf *bp) { KASSERT(bp->b_data == unmapped_buf, ("unmapped buf: corrupted b_data %p", bp)); } #define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp) #define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp) #else #define BUF_CHECK_MAPPED(bp) do {} while (0) #define BUF_CHECK_UNMAPPED(bp) do {} while (0) #endif static int isbufbusy(struct buf *bp) { if (((bp->b_flags & B_INVAL) == 0 && BUF_ISLOCKED(bp)) || ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)) return (1); return (0); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void bufshutdown(int show_busybufs) { static int first_buf_printf = 1; struct buf *bp; int iter, nbusy, pbusy; #ifndef PREEMPTION int subiter; #endif /* * Sync filesystems for shutdown */ wdog_kern_pat(WD_LASTVAL); kern_sync(curthread); /* * With soft updates, some buffers that are * written will be remarked as dirty until other * buffers are written. */ for (iter = pbusy = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) if (isbufbusy(bp)) nbusy++; if (nbusy == 0) { if (first_buf_printf) printf("All buffers synced."); break; } if (first_buf_printf) { printf("Syncing disks, buffers remaining... "); first_buf_printf = 0; } printf("%d ", nbusy); if (nbusy < pbusy) iter = 0; pbusy = nbusy; wdog_kern_pat(WD_LASTVAL); kern_sync(curthread); #ifdef PREEMPTION /* * Spin for a while to allow interrupt threads to run. */ DELAY(50000 * iter); #else /* * Context switch several times to allow interrupt * threads to run. 
*/ for (subiter = 0; subiter < 50 * iter; subiter++) { thread_lock(curthread); mi_switch(SW_VOL); DELAY(1000); } #endif } printf("\n"); /* * Count only busy local buffers to prevent forcing * a fsck if we're just a client of a wedged NFS server */ nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if (isbufbusy(bp)) { #if 0 /* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */ if (bp->b_dev == NULL) { TAILQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); continue; } #endif nbusy++; if (show_busybufs > 0) { printf( "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:", nbusy, bp, bp->b_vp, bp->b_flags, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); BUF_LOCKPRINTINFO(bp); if (show_busybufs > 1) vn_printf(bp->b_vp, "vnode content: "); } } } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("Giving up on %d buffers\n", nbusy); DELAY(5000000); /* 5 seconds */ } else { if (!first_buf_printf) printf("Final sync complete\n"); /* * Unmount filesystems */ if (!KERNEL_PANICKED()) vfs_unmountall(); } swapoff_all(); DELAY(100000); /* wait for console output to finish */ } static void bpmap_qenter(struct buf *bp) { BUF_CHECK_MAPPED(bp); /* * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data); pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } static inline struct bufdomain * bufdomain(struct buf *bp) { return (&bdomain[bp->b_domain]); } static struct bufqueue * bufqueue(struct buf *bp) { switch (bp->b_qindex) { case QUEUE_NONE: /* FALLTHROUGH */ case QUEUE_SENTINEL: return (NULL); case QUEUE_EMPTY: return (&bqempty); case QUEUE_DIRTY: return (&bufdomain(bp)->bd_dirtyq); case QUEUE_CLEAN: return (&bufdomain(bp)->bd_subq[bp->b_subqueue]); default: break; } panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex); } /* * Return the locked bufqueue that bp is a member of. */ static struct bufqueue * bufqueue_acquire(struct buf *bp) { struct bufqueue *bq, *nbq; /* * bp can be pushed from a per-cpu queue to the * cleanq while we're waiting on the lock. Retry * if the queues don't match. */ bq = bufqueue(bp); BQ_LOCK(bq); for (;;) { nbq = bufqueue(bp); if (bq == nbq) break; BQ_UNLOCK(bq); BQ_LOCK(nbq); bq = nbq; } return (bq); } /* * binsfree: * * Insert the buffer into the appropriate free list. Requires a * locked buffer on entry and buffer is unlocked before return. */ static void binsfree(struct buf *bp, int qindex) { struct bufdomain *bd; struct bufqueue *bq; KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY, ("binsfree: Invalid qindex %d", qindex)); BUF_ASSERT_XLOCKED(bp); /* * Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) { if (bp->b_qindex == qindex) { bp->b_flags |= B_REUSE; bp->b_flags &= ~B_REMFREE; BUF_UNLOCK(bp); return; } bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } bd = bufdomain(bp); if (qindex == QUEUE_CLEAN) { if (bd->bd_lim != 0) bq = &bd->bd_subq[PCPU_GET(cpuid)]; else bq = bd->bd_cleanq; } else bq = &bd->bd_dirtyq; bq_insert(bq, bp, true); } /* * buf_free: * * Free a buffer to the buf zone once it no longer has valid contents. 
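 *
 * (Paired with buf_alloc() below: headers are cycled through the buf_zone
 * UMA cache rather than being released outright, since much of the buf
 * code assumes type-stable storage; see the buf_import() comment below.)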
*/ static void buf_free(struct buf *bp) { if (bp->b_flags & B_REMFREE) bremfreef(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); bufkva_free(bp); atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1); BUF_UNLOCK(bp); uma_zfree(buf_zone, bp); } /* * buf_import: * * Import bufs into the uma cache from the buf list. The system still * expects a static array of bufs and much of the synchronization * around bufs assumes type stable storage. As a result, UMA is used * only as a per-cpu cache of bufs still maintained on a global list. */ static int buf_import(void *arg, void **store, int cnt, int domain, int flags) { struct buf *bp; int i; BQ_LOCK(&bqempty); for (i = 0; i < cnt; i++) { bp = TAILQ_FIRST(&bqempty.bq_queue); if (bp == NULL) break; bq_remove(&bqempty, bp); store[i] = bp; } BQ_UNLOCK(&bqempty); return (i); } /* * buf_release: * * Release bufs from the uma cache back to the buffer queues. */ static void buf_release(void *arg, void **store, int cnt) { struct bufqueue *bq; struct buf *bp; int i; bq = &bqempty; BQ_LOCK(bq); for (i = 0; i < cnt; i++) { bp = store[i]; /* Inline bq_insert() to batch locking. */ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; } BQ_UNLOCK(bq); } /* * buf_alloc: * * Allocate an empty buffer header. */ static struct buf * buf_alloc(struct bufdomain *bd) { struct buf *bp; int freebufs, error; /* * We can only run out of bufs in the buf zone if the average buf * is less than BKVASIZE. In this case the actual wait/block will * come from buf_recycle() failing to flush one of these small bufs. */ bp = NULL; freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1); if (freebufs > 0) bp = uma_zalloc(buf_zone, M_NOWAIT); if (bp == NULL) { atomic_add_int(&bd->bd_freebuffers, 1); bufspace_daemon_wakeup(bd); counter_u64_add(numbufallocfails, 1); return (NULL); } /* * Wake up the bufspace daemon on the transition below the threshold. */ if (freebufs == bd->bd_lofreebuffers) bufspace_daemon_wakeup(bd); error = BUF_LOCK(bp, LK_EXCLUSIVE, NULL); KASSERT(error == 0, ("%s: BUF_LOCK on free buf %p: %d.", __func__, bp, error)); (void)error; KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.", bp, bp->b_vp)); KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, ("invalid buffer %p flags %#x", bp, bp->b_flags)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); KASSERT(bp->b_npages == 0, ("bp: %p still has %d vm pages\n", bp, bp->b_npages)); KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp)); KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp)); bp->b_domain = BD_DOMAIN(bd); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); return (bp); } /* * buf_recycle: * * Free a buffer from the given bufqueue. kva controls whether the * freed buf must own some kva resources. This is used for * defragmenting.
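 *
 * Illustratively, buf_recycle(bd, false) frees the first eligible clean
 * buf and returns 0; ENOBUFS means the scan found nothing to free, with
 * bd_wanted set so that the caller may sleep on it and retry.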
*/ static int buf_recycle(struct bufdomain *bd, bool kva) { struct bufqueue *bq; struct buf *bp, *nbp; if (kva) counter_u64_add(bufdefragcnt, 1); nbp = NULL; bq = bd->bd_cleanq; BQ_LOCK(bq); KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd), ("buf_recycle: Locks don't match")); nbp = TAILQ_FIRST(&bq->bq_queue); /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { /* * Calculate next bp (we can only use it if we do not * release the bqlock). */ nbp = TAILQ_NEXT(bp, b_freelist); /* * If we are defragging then we need a buffer with * some kva to reclaim. */ if (kva && bp->b_kvasize == 0) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; /* * Implement a second chance algorithm for frequently * accessed buffers. */ if ((bp->b_flags & B_REUSE) != 0) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); bp->b_flags &= ~B_REUSE; BUF_UNLOCK(bp); continue; } /* * Skip buffers with background writes in progress. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { BUF_UNLOCK(bp); continue; } KASSERT(bp->b_qindex == QUEUE_CLEAN, ("buf_recycle: inconsistent queue %d bp %p", bp->b_qindex, bp)); KASSERT(bp->b_domain == BD_DOMAIN(bd), ("getnewbuf: queue domain %d doesn't match request %d", bp->b_domain, (int)BD_DOMAIN(bd))); /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. */ bq_remove(bq, bp); BQ_UNLOCK(bq); /* * Requeue the background write buffer with error and * restart the scan. */ if ((bp->b_vflags & BV_BKGRDERR) != 0) { bqrelse(bp); BQ_LOCK(bq); nbp = TAILQ_FIRST(&bq->bq_queue); continue; } bp->b_flags |= B_INVAL; brelse(bp); return (0); } bd->bd_wanted = 1; BQ_UNLOCK(bq); return (ENOBUFS); } /* * bremfree: * * Mark the buffer for removal from the appropriate free list. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); BUF_ASSERT_XLOCKED(bp); bp->b_flags |= B_REMFREE; } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { struct bufqueue *bq; bq = bufqueue_acquire(bp); bq_remove(bq, bp); BQ_UNLOCK(bq); } static void bq_init(struct bufqueue *bq, int qindex, int subqueue, const char *lockname) { mtx_init(&bq->bq_lock, lockname, NULL, MTX_DEF); TAILQ_INIT(&bq->bq_queue); bq->bq_len = 0; bq->bq_index = qindex; bq->bq_subqueue = subqueue; } static void bd_init(struct bufdomain *bd) { int i; bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1]; bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock"); bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock"); for (i = 0; i <= mp_maxid; i++) bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i, "bufq clean subqueue lock"); mtx_init(&bd->bd_run_lock, "bufspace daemon run lock", NULL, MTX_DEF); } /* * bq_remove: * * Removes a buffer from the free list, must be called with the * correct qlock held. 
*/ static void bq_remove(struct bufqueue *bq, struct buf *bp) { CTR3(KTR_BUF, "bq_remove(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_qindex != QUEUE_NONE, ("bq_remove: buffer %p not on a queue.", bp)); KASSERT(bufqueue(bp) == bq, ("bq_remove: Remove buffer %p from wrong queue.", bp)); BQ_ASSERT_LOCKED(bq); if (bp->b_qindex != QUEUE_EMPTY) { BUF_ASSERT_XLOCKED(bp); } KASSERT(bq->bq_len >= 1, ("queue %d underflow", bp->b_qindex)); TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); bq->bq_len--; bp->b_qindex = QUEUE_NONE; bp->b_flags &= ~(B_REMFREE | B_REUSE); } static void bd_flush(struct bufdomain *bd, struct bufqueue *bq) { struct buf *bp; BQ_ASSERT_LOCKED(bq); if (bq != bd->bd_cleanq) { BD_LOCK(bd); while ((bp = TAILQ_FIRST(&bq->bq_queue)) != NULL) { TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist); TAILQ_INSERT_TAIL(&bd->bd_cleanq->bq_queue, bp, b_freelist); bp->b_subqueue = bd->bd_cleanq->bq_subqueue; } bd->bd_cleanq->bq_len += bq->bq_len; bq->bq_len = 0; } if (bd->bd_wanted) { bd->bd_wanted = 0; wakeup(&bd->bd_wanted); } if (bq != bd->bd_cleanq) BD_UNLOCK(bd); } static int bd_flushall(struct bufdomain *bd) { struct bufqueue *bq; int flushed; int i; if (bd->bd_lim == 0) return (0); flushed = 0; for (i = 0; i <= mp_maxid; i++) { bq = &bd->bd_subq[i]; if (bq->bq_len == 0) continue; BQ_LOCK(bq); bd_flush(bd, bq); BQ_UNLOCK(bq); flushed++; } return (flushed); } static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock) { struct bufdomain *bd; if (bp->b_qindex != QUEUE_NONE) panic("bq_insert: free buffer %p onto another queue?", bp); bd = bufdomain(bp); if (bp->b_flags & B_AGE) { /* Place this buf directly on the real queue. */ if (bq->bq_index == QUEUE_CLEAN) bq = bd->bd_cleanq; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, bp, b_freelist); } else { BQ_LOCK(bq); TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist); } bp->b_flags &= ~(B_AGE | B_REUSE); bq->bq_len++; bp->b_qindex = bq->bq_index; bp->b_subqueue = bq->bq_subqueue; /* * Unlock before we notify so that we don't wakeup a waiter that * fails a trylock on the buf and sleeps again. */ if (unlock) BUF_UNLOCK(bp); if (bp->b_qindex == QUEUE_CLEAN) { /* * Flush the per-cpu queue and notify any waiters. */ if (bd->bd_wanted || (bq != bd->bd_cleanq && bq->bq_len >= bd->bd_lim)) bd_flush(bd, bq); } BQ_UNLOCK(bq); } /* * bufkva_free: * * Free the kva allocation for a buffer. * */ static void bufkva_free(struct buf *bp) { #ifdef INVARIANTS if (bp->b_kvasize == 0) { KASSERT(bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf, ("Leaked KVA space on %p", bp)); } else if (buf_mapped(bp)) BUF_CHECK_MAPPED(bp); else BUF_CHECK_UNMAPPED(bp); #endif if (bp->b_kvasize == 0) return; vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize); counter_u64_add(bufkvaspace, -bp->b_kvasize); counter_u64_add(buffreekvacnt, 1); bp->b_data = bp->b_kvabase = unmapped_buf; bp->b_kvasize = 0; } /* * bufkva_alloc: * * Allocate the buffer KVA and set b_kvasize and b_kvabase. */ static int bufkva_alloc(struct buf *bp, int maxsize, int gbflags) { vm_offset_t addr; int error; KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0, ("Invalid gbflags 0x%x in %s", gbflags, __func__)); bufkva_free(bp); addr = 0; error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr); if (error != 0) { /* * Buffer map is too fragmented. Request the caller * to defragment the map. 
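 *
 * (Editorial note: the defragmentation itself happens elsewhere; the
 * buffer_arena reclamation callback bufkva_reclaim() below frees KVA
 * via buf_recycle(&bdomain[q], true), and callers such as
 * bp_unmapped_get_kva() retry after bufspace_wait().)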
*/ return (error); } bp->b_kvabase = (caddr_t)addr; bp->b_kvasize = maxsize; counter_u64_add(bufkvaspace, bp->b_kvasize); if ((gbflags & GB_UNMAPPED) != 0) { bp->b_data = unmapped_buf; BUF_CHECK_UNMAPPED(bp); } else { bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); } return (0); } /* * bufkva_reclaim: * * Reclaim buffer kva by freeing buffers holding kva. This is a vmem * callback invoked when vmem would otherwise fail an allocation. */ static void bufkva_reclaim(vmem_t *vmem, int flags) { bool done; int q; int i; done = false; for (i = 0; i < 5; i++) { for (q = 0; q < buf_domains; q++) if (buf_recycle(&bdomain[q], true) != 0) done = true; if (done) break; } return; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE is set, * the buffer is valid and we do not have to do anything. */ static void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred, int flags, void (*ckhashfunc)(struct buf *)) { struct buf *rabp; struct thread *td; int i; td = curthread; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) != 0) { brelse(rabp); continue; } #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, rabp, 0); PROC_UNLOCK(curproc); } #endif /* RACCT */ td->td_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { rabp->b_flags |= B_CKHASH; rabp->b_ckhashcalc = ckhashfunc; } rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } } /* * Entry point for bread() and breadn() via #defines in sys/buf.h. * * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything, see * getblk(). Also starts asynchronous I/O on read-ahead blocks. * * Always return a NULL buffer pointer (in bpp) when returning an error. * * The blkno parameter is the logical block being requested. Normally * the mapping of logical block number to disk block address is done * by calling VOP_BMAP(). However, if the mapping is already known, the * disk block address can be passed using the dblkno parameter. If the * disk block address is not known, then the same value should be passed * for blkno and dblkno. */ int breadn_flags(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { struct buf *bp; struct thread *td; int error, readwait, rv; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); td = curthread; /* * Can only return NULL if GB_LOCK_NOWAIT or GB_NOSPARSE flags * are specified.
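 *
 * Editorial sketch of a typical caller via the bread() wrapper from
 * sys/buf.h (illustrative only; vp is a locked vnode, lbn and bsize a
 * logical block number and block size):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error != 0)
 *		return (error);	(bp is NULL on error)
 *	... inspect bp->b_data ...
 *	brelse(bp);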
*/ error = getblkx(vp, blkno, dblkno, size, 0, 0, flags, &bp); if (error != 0) { *bpp = NULL; return (error); } KASSERT(blkno == bp->b_lblkno, ("getblkx returned buffer for blkno %jd instead of blkno %jd", (intmax_t)bp->b_lblkno, (intmax_t)blkno)); flags &= ~GB_NOSPARSE; *bpp = bp; /* * If not found in cache, do some I/O */ readwait = 0; if ((bp->b_flags & B_CACHE) == 0) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_add_buf(td->td_proc, bp, 0); PROC_UNLOCK(td->td_proc); } #endif /* RACCT */ td->td_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; if ((flags & GB_CKHASH) != 0) { bp->b_flags |= B_CKHASH; bp->b_ckhashcalc = ckhashfunc; } if ((flags & GB_CVTENXIO) != 0) bp->b_xflags |= BX_CVTENXIO; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. */ breada(vp, rablkno, rabsize, cnt, cred, flags, ckhashfunc); rv = 0; if (readwait) { rv = bufwait(bp); if (rv != 0) { brelse(bp); *bpp = NULL; } } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; long space; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) { bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_CACHE; brelse(bp); return (ENXIO); } if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (bp->b_flags & B_BARRIER) atomic_add_long(&barrierwrites, 1); oldflags = bp->b_flags; KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* * Mark the buffer clean. Increment the bufobj write count * before bundirty() call, to prevent another thread from seeing * an empty dirty list and a zero counter for writes in progress, * falsely indicating that the bufobj is clean. */ bufobj_wref(bp->b_bufobj); bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace); #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_buf(curproc, bp, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); buf_track(bp, __func__); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else if (space > hirunningspace) { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock.
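 *
 * (Editorial note: "space" holds the runningbufspace total sampled
 * by the fetchadd above, before this write was added, so the
 * throttle engages only once the pipeline was already past
 * hirunningspace.)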
*/ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't call buf_countdeps() with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather than in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT((bp->b_flags & B_BARRIER) == 0, ("Barrier request in delayed write %p", bp)); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other data structure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } buf_track(bp, __func__); /* * Set the *dirty* buffer range based upon the VM system dirty * pages. * * Mark the buffer pages as clean. We need to do this here to * satisfy the vnode_pager and the pageout daemon, so that it * thinks that the pages have been "cleaned". Note that since * the pages are in a delayed write buffer -- the VFS layer * "will" see that the pages get written out on the next sync, * or perhaps the cluster will be completed. */ vfs_clean_pages_dirty_buf(bp); bqrelse(bp); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI.
We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); bdirtyadd(bp); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); bdirtysub(bp); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * babarrierwrite: * * Asynchronous barrier write. Start output on a buffer, but do not * wait for it to complete. Place a write barrier after this write so * that this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ void babarrierwrite(struct buf *bp) { bp->b_flags |= B_ASYNC | B_BARRIER; (void) bwrite(bp); } /* * bbarrierwrite: * * Synchronous barrier write. Start output on a buffer and wait for * it to complete. Place a write barrier after this write so that * this buffer and all buffers written before it are committed to * the disk before any buffers written after this write are committed * to the disk. The buffer is released when the output completes. */ int bbarrierwrite(struct buf *bp) { bp->b_flags |= B_BARRIER; return (bwrite(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (buf_dirty_count_severe()) { mtx_lock(&bdirtylock); while (buf_dirty_count_severe()) { bdirtywait = 1; msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&bdirtylock); } } /* * Return true if we have too many dirty buffers. 
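 *
 * Editorial sketch (illustrative): bwillwrite() above is the intended
 * consumer of this predicate; a write path calls it before taking any
 * vnode locks, e.g.:
 *
 *	bwillwrite();		(may sleep until the dirty count drops)
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	... dirty some buffers ...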
*/ int buf_dirty_count_severe(void) { return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty)); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf *bp) { struct mount *v_mnt; int qindex; /* * Many functions erroneously call brelse with a NULL bp under rare * error conditions. Simply return when called with a NULL bp. */ if (bp == NULL) return; CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* * Do not process, in particular, do not handle the * B_INVAL/B_RELBUF and do not release to free list. */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); bdirty(bp); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_flags & B_INVALONERR)) { /* * Forced invalidation of dirty buffer contents, to be used * after a failed write in the rare case that the loss of the * contents is acceptable. The buffer is invalidated and * freed. */ bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; bp->b_flags &= ~(B_ASYNC | B_CACHE); } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. All errors except ENXIO (which * means the device is gone) are treated as being * transient. * * XXX Treating EIO as transient is not correct; the * contract with the local storage device drivers is that * they will only return EIO once the I/O is no longer * retriable. Network I/O also respects this through the * guarantees of TCP and/or the internal retries of NFS. * ENOMEM might be transient, but we also have no way of * knowing when its ok to retry/reschedule. In general, * this entire case should be made obsolete through better * error handling/recovery and resource scheduling. * * Do this also for buffers that failed with ENXIO, but have * non-empty dependencies - the soft updates code might need * to access the buffer to untangle them. * * Must clear BIO_ERROR to prevent pages from being scrapped. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed read I/O, or we were asked to free or not * cache the buffer, or we failed to write to a device that's * no longer present. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) bdirtysub(bp); bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_truncate() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_truncate(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. 
It is not strictly necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags affect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ v_mnt = bp->b_vp != NULL ? bp->b_vp->v_mount : NULL; if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE || (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) && (v_mnt == NULL || (v_mnt->mnt_vfc->vfc_flags & VFCF_NETWORK) == 0 || - vn_isdisk(bp->b_vp, NULL) || (bp->b_flags & B_DELWRI) == 0)) { + vn_isdisk(bp->b_vp) || (bp->b_flags & B_DELWRI) == 0)) { vfs_vmio_invalidate(bp); allocbuf(bp, 0); } if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } /* * If the buffer has junk contents signal it and eventually * clean up B_DELWRI and disassociate the vnode so that gbincore() * doesn't find it. */ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 || (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0) bp->b_flags |= B_INVAL; if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } buf_track(bp, __func__); /* buffers with no memory */ if (bp->b_bufsize == 0) { buf_free(bp); return; } /* buffers with junk contents */ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); qindex = QUEUE_CLEAN; bp->b_flags |= B_AGE; /* remaining buffers */ } else if (bp->b_flags & B_DELWRI) qindex = QUEUE_DIRTY; else qindex = QUEUE_CLEAN; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_RELBUF | B_DIRECT); bp->b_xflags &= ~(BX_CVTENXIO); /* binsfree unlocks bp. */ binsfree(bp, qindex); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion.
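 *
 * Editorial sketch (illustrative) of the two release paths:
 *
 *	bqrelse(bp);	(expect reuse soon: keep contents, just requeue)
 *	brelse(bp);	(may invalidate or free, honoring B_RELBUF etc.)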
*/ void bqrelse(struct buf *bp) { int qindex; CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); qindex = QUEUE_NONE; if (BUF_LOCKRECURSED(bp)) { /* do not release to free list */ BUF_UNLOCK(bp); return; } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); bp->b_xflags &= ~(BX_CVTENXIO); if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) bremfreef(bp); goto out; } /* buffers with stale but valid contents */ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) { BO_LOCK(bp->b_bufobj); bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); qindex = QUEUE_DIRTY; } else { if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); if ((bp->b_flags & B_NOREUSE) != 0) { brelse(bp); return; } qindex = QUEUE_CLEAN; } buf_track(bp, __func__); /* binsfree unlocks bp. */ binsfree(bp, qindex); return; out: buf_track(bp, __func__); /* unlock */ BUF_UNLOCK(bp); } /* * Complete I/O to a VMIO backed page. Validate the pages as appropriate, * restore bogus pages. */ static void vfs_vmio_iodone(struct buf *bp) { vm_ooffset_t foff; vm_page_t m; vm_object_t obj; struct vnode *vp __unused; int i, iosize, resid; bool bogus; obj = bp->b_bufobj->bo_object; KASSERT(blockcount_read(&obj->paging_in_progress) >= bp->b_npages, ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)", blockcount_read(&obj->paging_in_progress), bp->b_npages)); vp = bp->b_vp; VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_object != NULL, vp); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_vmio_iodone: bp %p has no buffer offset", bp)); bogus = false; iosize = bp->b_bcount - bp->b_resid; for (i = 0; i < bp->b_npages; i++) { resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogus = true; m = vm_page_relookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; } else if ((bp->b_iocmd == BIO_READ) && resid > 0) { /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK, resid)) == 0, ("vfs_vmio_iodone: page %p " "has unexpected dirty bits", m)); vfs_page_set_valid(bp, foff, m); } KASSERT(OFF_TO_IDX(foff) == m->pindex, ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch", (intmax_t)foff, (uintmax_t)m->pindex)); vm_page_sunbusy(m); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } vm_object_pip_wakeupn(obj, bp->b_npages); if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Perform page invalidation when a buffer is released. The fully invalid * pages will be reclaimed later in vfs_vmio_truncate(). */ static void vfs_vmio_invalidate(struct buf *bp) { vm_object_t obj; vm_page_t m; int flags, i, resid, poffset, presid; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. 
* * block sizes less than DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information. */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; obj = bp->b_bufobj->bo_object; resid = bp->b_bufsize; poffset = bp->b_offset & PAGE_MASK; VM_OBJECT_WLOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) panic("vfs_vmio_invalidate: Unexpected bogus page."); bp->b_pages[i] = NULL; presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_busy_acquire(m, VM_ALLOC_SBUSY); if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vm_page_sunbusy(m); vm_page_release_locked(m, flags); resid -= presid; poffset = 0; } VM_OBJECT_WUNLOCK(obj); bp->b_npages = 0; } /* * Page-granular truncation of an existing VMIO buffer. */ static void vfs_vmio_truncate(struct buf *bp, int desiredpages) { vm_object_t obj; vm_page_t m; int flags, i; if (bp->b_npages == desiredpages) return; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages); } else BUF_CHECK_UNMAPPED(bp); /* * The object lock is needed only if we will attempt to free pages. */ flags = (bp->b_flags & B_NOREUSE) != 0 ? VPR_NOREUSE : 0; if ((bp->b_flags & B_DIRECT) != 0) { flags |= VPR_TRYFREE; obj = bp->b_bufobj->bo_object; VM_OBJECT_WLOCK(obj); } else { obj = NULL; } for (i = desiredpages; i < bp->b_npages; i++) { m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); bp->b_pages[i] = NULL; if (obj != NULL) vm_page_release_locked(m, flags); else vm_page_release(m, flags); } if (obj != NULL) VM_OBJECT_WUNLOCK(obj); bp->b_npages = desiredpages; } /* * Byte granular extension of VMIO buffers. */ static void vfs_vmio_extend(struct buf *bp, int desiredpages, int size) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; vm_page_t m; /* * Step 1, bring in the VM pages from the object, allocating * them if necessary. We must clear B_CACHE if these pages * are not valid for the range covered by the buffer. */ obj = bp->b_bufobj->bo_object; if (bp->b_npages < desiredpages) { /* * We must allocate system pages since blocking * here could interfere with paging I/O, no * matter which process we are. * * Only exclusive busy can be tested here. * Blocking on shared busy might lead to * deadlocks once allocbuf() is called after * pages are vfs_busy_pages(). */ (void)vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages, VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED, &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages); bp->b_npages = desiredpages; } /* * Step 2. We've loaded the pages into the buffer; now * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), not the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set!
XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; m = bp->b_pages[pi]; vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m); toff += tinc; tinc = PAGE_SIZE; } /* * Step 3, fixup the KVA pmap. */ if (buf_mapped(bp)) bpmap_qenter(bp); else BUF_CHECK_UNMAPPED(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better than the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { struct bufobj *bo; int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; int gbflags; bo = &vp->v_bufobj; gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather than at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; BO_RLOCK(bo); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; BO_RUNLOCK(bo); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl, gbflags); return (nwritten); } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return (nwritten); } /* * getnewbuf_kva: * * Allocate KVA for an empty buf header according to gbflags. */ static int getnewbuf_kva(struct buf *bp, int gbflags, int maxsize) { if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) { /* * In order to keep fragmentation sane we only allocate kva * in BKVASIZE chunks. XXX with vmem we can do page size.
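 *
 * (Editorial example, assuming the common BKVASIZE of 16kB: a 9kB
 * maxsize rounds up to 16kB below, so resizes that stay within one
 * chunk leave b_kvasize alone and skip the vmem arena entirely.)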
*/ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize && bufkva_alloc(bp, maxsize, gbflags)) return (ENOSPC); } return (0); } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_arena is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * The caller is responsible for releasing the reserved bufspace after * allocbuf() is called. */ static struct buf * getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags) { struct bufdomain *bd; struct buf *bp; bool metadata, reserved; bp = NULL; KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); if (!unmapped_buf_allowed) gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC); if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 || vp->v_type == VCHR) metadata = true; else metadata = false; if (vp == NULL) bd = &bdomain[0]; else bd = &bdomain[vp->v_bufobj.bo_domain]; counter_u64_add(getnewbufcalls, 1); reserved = false; do { if (reserved == false && bufspace_reserve(bd, maxsize, metadata) != 0) { counter_u64_add(getnewbufrestarts, 1); continue; } reserved = true; if ((bp = buf_alloc(bd)) == NULL) { counter_u64_add(getnewbufrestarts, 1); continue; } if (getnewbuf_kva(bp, gbflags, maxsize) == 0) return (bp); break; } while (buf_recycle(bd, false) == 0); if (reserved) bufspace_release(bd, maxsize); if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } bufspace_wait(bd, vp, gbflags, slpflag, slptimeo); return (NULL); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp); static int buf_flush(struct vnode *vp, struct bufdomain *bd, int target) { int flushed; flushed = flushbufqueues(vp, bd, target, 0); if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ if (vp != NULL && target > 2) target /= 2; flushbufqueues(vp, bd, target, 1); } return (flushed); } static void buf_daemon() { struct bufdomain *bd; int speedupreq; int lodirty; int i; /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, curthread, SHUTDOWN_PRI_LAST + 100); /* * Start the buf clean daemons as children threads. */ for (i = 0 ; i < buf_domains; i++) { int error; error = kthread_add((void (*)(void *))bufspace_daemon, &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i); if (error) panic("error %d spawning bufspace daemon", error); } /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kthread_suspend_check(); /* * Save speedupreq for this pass and reset to capture new * requests. */ speedupreq = bd_speedupreq; bd_speedupreq = 0; /* * Flush each domain sequentially according to its level and * the speedup request. 
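 *
 * (Editorial note: a speedup request targets the more aggressive
 * bd_numdirtybuffers / 2 instead of the domain's configured
 * bd_lodirtybuffers low-water mark, as the code below shows.)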
*/ for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; if (speedupreq) lodirty = bd->bd_numdirtybuffers / 2; else lodirty = bd->bd_lodirtybuffers; while (bd->bd_numdirtybuffers > lodirty) { if (buf_flush(NULL, bd, bd->bd_numdirtybuffers - lodirty) == 0) break; kern_yield(PRI_USER); } } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep for a short period * to avoid endless loops on unlockable buffers. */ mtx_lock(&bdlock); if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; /* * Do an extra wakeup in case dirty threshold * changed via sysctl and the explicit transition * out of shortfall was missed. */ bdirtywakeup(); if (runningbufspace <= lorunningspace) runningwakeup(); msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, so we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of writing them, which NFS is * particularly sensitive to. */ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW | CTLFLAG_STATS, &flushwithdeps, 0, "Number of buffers flushed with dependencies that require rollbacks"); static int flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target, int flushdeps) { struct bufqueue *bq; struct buf *sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int error; bool unlock; flushed = 0; bq = &bd->bd_dirtyq; bp = NULL; sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO); sentinel->b_qindex = QUEUE_SENTINEL; BQ_LOCK(bq); TAILQ_INSERT_HEAD(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); while (flushed != target) { maybe_yield(); BQ_LOCK(bq); bp = TAILQ_NEXT(sentinel, b_freelist); if (bp != NULL) { TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); TAILQ_INSERT_AFTER(&bq->bq_queue, bp, sentinel, b_freelist); } else { BQ_UNLOCK(bq); break; } /* * Skip sentinels inserted by other invocations of * flushbufqueues(), taking care to not reorder them. * * Only flush the buffers that belong to the * vnode locked by the curthread. */ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL && bp->b_vp != lvp)) { BQ_UNLOCK(bq); continue; } error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL); BQ_UNLOCK(bq); if (error != 0) continue; /* * BKGRDINPROG can only be set with the buf and bufobj * locks both held. We tolerate a race to clear it here. */ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); continue; } if (bp->b_flags & B_INVAL) { bremfreef(bp); brelse(bp); flushed++; continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal order * of vnode lock followed by buf lock.
This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (lvp == NULL) { unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); } else { ASSERT_VOP_LOCKED(vp, "getbuf"); unlock = false; error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 : vn_lock(vp, LK_TRYUPGRADE); } if (error == 0) { CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (curproc == bufdaemonproc) { vfs_bio_awrite(bp); } else { bremfree(bp); bwrite(bp); counter_u64_add(notbufdflushes, 1); } vn_finished_write(mp); if (unlock) VOP_UNLOCK(vp); flushwithdeps += hasdeps; flushed++; /* * Sleeping on runningbufspace while holding * vnode lock leads to deadlock. */ if (curproc == bufdaemonproc && runningbufspace > hirunningspace) waitrunningbufspace(); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } BQ_LOCK(bq); TAILQ_REMOVE(&bq->bq_queue, sentinel, b_freelist); BQ_UNLOCK(bq); free(sentinel, M_TEMP); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { return (gbincore_unlocked(bo, blkno)); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return (1); if (vp->v_mount == NULL) return (0); obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_RLOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_RUNLOCK(obj); return (1); notinmem: VM_OBJECT_RUNLOCK(obj); return (0); } /* * Set the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. The range is limited * to the size of the buffer. * * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intervention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize.
*/ static void vfs_clean_pages_dirty_buf(struct buf *bp) { vm_ooffset_t foff, noff, eoff; vm_page_t m; int i; if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages_dirty_buf: no buffer offset")); vfs_busy_pages_acquire(bp); vfs_setdirty_range(bp); for (i = 0; i < bp->b_npages; i++) { noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; m = bp->b_pages[i]; vfs_page_set_validclean(bp, foff, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vfs_busy_pages_release(bp); } static void vfs_setdirty_range(struct buf *bp) { vm_offset_t boffset; vm_offset_t eoffset; int i; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } /* * Allocate the KVA mapping for an existing buffer. * If an unmapped buffer is provided but a mapped buffer is requested, take * also care to properly setup mappings between pages and KVA. */ static void bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags) { int bsize, maxsize, need_mapping, need_kva; off_t offset; need_mapping = bp->b_data == unmapped_buf && (gbflags & GB_UNMAPPED) == 0; need_kva = bp->b_kvabase == unmapped_buf && bp->b_data == unmapped_buf && (gbflags & GB_KVAALLOC) != 0; if (!need_mapping && !need_kva) return; BUF_CHECK_UNMAPPED(bp); if (need_mapping && bp->b_kvabase != unmapped_buf) { /* * Buffer is not mapped, but the KVA was already * reserved at the time of the instantiation. Use the * allocated space. */ goto has_addr; } /* * Calculate the amount of the address space we would reserve * if the buffer was mapped. */ - bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; + bsize = vn_isdisk(bp->b_vp) ? DEV_BSIZE : bp->b_bufobj->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; maxsize = size + (offset & PAGE_MASK); maxsize = imax(maxsize, bsize); while (bufkva_alloc(bp, maxsize, gbflags) != 0) { if ((gbflags & GB_NOWAIT_BD) != 0) { /* * XXXKIB: defragmentation cannot * succeed, not sure what else to do. */ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp); } counter_u64_add(mappingrestarts, 1); bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0); } has_addr: if (need_mapping) { /* b_offset is handled by bpmap_qenter. 
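 * (Editorial note: bpmap_qenter() both enters b_pages into the fresh
 * KVA and folds the page offset of b_offset back into b_data.)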
*/ bp->b_data = bp->b_kvabase; BUF_CHECK_MAPPED(bp); bpmap_qenter(bp); } } struct buf * getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; int error; error = getblkx(vp, blkno, blkno, size, slpflag, slptimeo, flags, &bp); if (error != 0) return (NULL); return (bp); } /* * getblkx: * * Get a block given a specified block and offset into a file/device. * The buffer's B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whose * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successful read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. * * The blkno parameter is the logical block being requested. Normally * the mapping of logical block number to disk block address is done * by calling VOP_BMAP(). However, if the mapping is already known, the * disk block address can be passed using the dblkno parameter. If the * disk block address is not known, then the same value should be passed * for blkno and dblkno. */ int getblkx(struct vnode *vp, daddr_t blkno, daddr_t dblkno, int size, int slpflag, int slptimeo, int flags, struct buf **bpp) { struct buf *bp; struct bufobj *bo; daddr_t d_blkno; int bsize, error, maxsize, vmio; off_t offset; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC, ("GB_KVAALLOC only makes sense with GB_UNMAPPED")); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > maxbcachebuf) panic("getblk: size(%d) > maxbcachebuf(%d)\n", size, maxbcachebuf); if (!unmapped_buf_allowed) flags &= ~(GB_UNMAPPED | GB_KVAALLOC); bo = &vp->v_bufobj; d_blkno = dblkno; /* Attempt lockless lookup first. */ bp = gbincore_unlocked(bo, blkno); if (bp == NULL) goto newbuf_unlocked; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL, "getblku", 0, 0); if (error != 0) goto loop; /* Verify buf identity has not changed since lookup. */ if (bp->b_bufobj == bo && bp->b_lblkno == blkno) goto foundbuf_fastpath; /* It changed, fall back to locked lookup. */ BUF_UNLOCK_RAW(bp); loop: BO_RLOCK(bo); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core.
If the buffer is not busy nor managed, * it must be on a queue. */ lockflags = LK_EXCLUSIVE | LK_INTERLOCK | ((flags & GB_LOCK_NOWAIT) ? LK_NOWAIT : LK_SLEEPFAIL); error = BUF_TIMELOCK(bp, lockflags, BO_LOCKPTR(bo), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error != 0) return (error); foundbuf_fastpath: /* If recursed, assume caller knows the rules. */ if (BUF_LOCKRECURSED(bp)) goto end; /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; if (bp->b_flags & B_MANAGED) MPASS(bp->b_qindex == QUEUE_NONE); else bremfree(bp); /* * check for size inconsistencies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * Handle the case of unmapped buffer which should * become mapped, or the buffer for which KVA * reservation is requested. */ bp_unmapped_get_kva(bp, blkno, size, flags); /* * If the size is inconsistent in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ BO_RUNLOCK(bo); newbuf_unlocked: /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return (EEXIST); - bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize; + bsize = vn_isdisk(vp) ? 
DEV_BSIZE : bo->bo_bsize; KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize")); offset = blkno * bsize; vmio = vp->v_object != NULL; if (vmio) { maxsize = size + (offset & PAGE_MASK); } else { maxsize = size; /* Do not allow non-VMIO unmapped buffers. */ flags &= ~(GB_UNMAPPED | GB_KVAALLOC); } maxsize = imax(maxsize, bsize); if ((flags & GB_NOSPARSE) != 0 && vmio && - !vn_isdisk(vp, NULL)) { + !vn_isdisk(vp)) { error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0); KASSERT(error != EOPNOTSUPP, ("GB_NOSPARSE from fs not supporting bmap, vp %p", vp)); if (error != 0) return (error); if (d_blkno == -1) return (EJUSTRETURN); } bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags); if (bp == NULL) { if (slpflag || slptimeo) return (ETIMEDOUT); /* * XXX This is here until the sleep path is diagnosed * enough to work under very low memory conditions. * * There's an issue on low memory, 4BSD+non-preempt * systems (eg MIPS routers with 32MB RAM) where buffer * exhaustion occurs without sleeping for buffer * reclamation. This just sticks in a loop and * constantly attempts to allocate a buffer, which * hits exhaustion and tries to wakeup bufdaemon. * This never happens because we never yield. * * The real solution is to identify and fix these cases * so we aren't effectively busy-waiting in a loop * until the reclamation path has cycles to run. */ kern_yield(PRI_USER); goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; bufspace_release(bufdomain(bp), maxsize); brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_lblkno = blkno; bp->b_blkno = d_blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); BUF_CHECK_MAPPED(bp); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); end: buf_track(bp, __func__); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); *bpp = bp; return (0); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL.
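 *
 * Editorial sketch of a transient consumer (illustrative only):
 *
 *	bp = geteblk(size, 0);
 *	... scribble on bp->b_data ...
 *	brelse(bp);	(B_INVAL is set, so the buffer is simply freed)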
*/ struct buf * geteblk(int size, int flags) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) { if ((flags & GB_NOWAIT_BD) && (curthread->td_pflags & TDP_BUFNEED) != 0) return (NULL); } allocbuf(bp, size); bufspace_release(bufdomain(bp), maxsize); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * Truncate the backing store for a non-vmio buffer. */ static void vfs_nonvmio_truncate(struct buf *bp, int newbsize) { if (bp->b_flags & B_MALLOC) { /* * malloced buffers are not shrunk */ if (newbsize == 0) { bufmallocadjust(bp, 0); free(bp->b_data, M_BIOBUF); bp->b_data = bp->b_kvabase; bp->b_flags &= ~B_MALLOC; } return; } vm_hold_free_pages(bp, newbsize); bufspace_adjust(bp, newbsize); } /* * Extend the backing for a non-VMIO buffer. */ static void vfs_nonvmio_extend(struct buf *bp, int newbsize) { caddr_t origbuf; int origbufsize; /* * We only use malloced memory on the first allocation, * and revert to page-allocated memory when the buffer * grows. * * There is a potential SMP race here that could lead * to bufmallocspace slightly exceeding the max. It * is probably extremely rare and not worth worrying * over. */ if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 && bufmallocspace < maxbufmallocspace) { bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK); bp->b_flags |= B_MALLOC; bufmallocadjust(bp, newbsize); return; } /* * If the buffer is growing on its other-than-first * allocation, then we revert to the page-allocation * scheme. */ origbuf = NULL; origbufsize = 0; if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufmallocadjust(bp, 0); bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf != NULL) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } bufspace_adjust(bp, newbsize); } /* * This code constructs the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistent data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy-nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize; if (bp->b_bcount == size) return (1); if (bp->b_kvasize != 0 && bp->b_kvasize < size) panic("allocbuf: buffer too small"); newbsize = roundup2(size, DEV_BSIZE); if ((bp->b_flags & B_VMIO) == 0) { if ((bp->b_flags & B_MALLOC) == 0) newbsize = round_page(newbsize); /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ if (newbsize < bp->b_bufsize) vfs_nonvmio_truncate(bp, newbsize); else if (newbsize > bp->b_bufsize) vfs_nonvmio_extend(bp, newbsize); } else { int desiredpages; desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length.
*/ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) vfs_vmio_truncate(bp, desiredpages); /* XXX This looks as if it should be newbsize > b_bufsize */ else if (size > bp->b_bcount) vfs_vmio_extend(bp, desiredpages, size); bufspace_adjust(bp, newbsize); } bp->b_bcount = size; /* requested buffer size. */ return (1); } extern int inflight_transient_maps; static struct bio_queue nondump_bios; void biodone(struct bio *bp) { struct mtx *mtxp; void (*done)(struct bio *); vm_offset_t start, end; biotrack(bp, __func__); /* * Avoid completing I/O when dumping after a panic since that may * result in a deadlock in the filesystem or pager code. Note that * this doesn't affect dumps that were started manually since we aim * to keep the system usable after it has been resumed. */ if (__predict_false(dumping && SCHEDULER_STOPPED())) { TAILQ_INSERT_HEAD(&nondump_bios, bp, bio_queue); return; } if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) { bp->bio_flags &= ~BIO_TRANSIENT_MAPPING; bp->bio_flags |= BIO_UNMAPPED; start = trunc_page((vm_offset_t)bp->bio_data); end = round_page((vm_offset_t)bp->bio_data + bp->bio_length); bp->bio_data = unmapped_buf; pmap_qremove(start, atop(end - start)); vmem_free(transient_arena, start, end - start); atomic_add_int(&inflight_transient_maps, -1); } done = bp->bio_done; if (done == NULL) { mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->bio_flags |= BIO_DONE; wakeup(bp); mtx_unlock(mtxp); } else done(bp); } /* * Wait for a BIO to finish. */ int biowait(struct bio *bp, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, mtxp, PRIBIO, wchan, 0); mtx_unlock(mtxp); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) void biotrack_buf(struct bio *bp, const char *location) { buf_track(bp->bio_track_bp, location); } #endif /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * bufdone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occurred, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * bufdone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existence * in the bufdone routine.
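* * Hedged sketch of the synchronous completion pattern (illustrative; * not part of this change): * * bp->b_iocmd = BIO_READ; * bp->b_flags &= ~B_ASYNC; * BO_STRATEGY(bo, bp); * error = bufwait(bp); * * In the sync case bufdone() ends in bdone(), which sets B_DONE and * wakes the bufwait() sleeper; in the async case the buffer is instead * released here via brelse() or bqrelse().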
*/ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); buf_track(bp, __func__); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } if (bp->b_flags & B_VMIO) { /* * Set B_CACHE if the op was a normal read and no error * occurred. B_CACHE is set for writes in the b*write() * routines. */ if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) bp->b_flags |= B_CACHE; vfs_vmio_iodone(bp); } if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if ((bp->b_flags & B_CKHASH) != 0) { KASSERT(bp->b_iocmd == BIO_READ, ("bufdone: b_iocmd %d not BIO_READ", bp->b_iocmd)); KASSERT(buf_mapped(bp), ("bufdone: bp %p not mapped", bp)); (*bp->b_ckhashcalc)(bp); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); if (dropobj) bufobj_wdrop(dropobj); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistent. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_relookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } else BUF_CHECK_UNMAPPED(bp); } vm_page_sunbusy(m); } vm_object_pip_wakeupn(obj, bp->b_npages); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t eoff; /* * Compute the end offset, eoff, such that [off, eoff) does not span a * page boundary and eoff is not greater than the end of the buffer. * The end of the buffer, in this case, is our file EOF, not the * allocation size of the buffer. */ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > off) vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off); } /* * vfs_page_set_validclean: * * Set the valid bits and clear the dirty bits in a page based on the * supplied offset. The range is restricted to the buffer's size. */ static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundary or cross the end of the buffer. 
The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * Acquire a shared busy on all pages in the buf. */ void vfs_busy_pages_acquire(struct buf *bp) { int i; for (i = 0; i < bp->b_npages; i++) vm_page_busy_acquire(bp->b_pages[i], VM_ALLOC_SBUSY); } void vfs_busy_pages_release(struct buf *bp) { int i; for (i = 0; i < bp->b_npages; i++) vm_page_sunbusy(bp->b_pages[i]); } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being exclusive busy. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistent. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistent state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { vm_object_t obj; vm_ooffset_t foff; vm_page_t m; int i; bool bogus; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, bp->b_npages); vfs_busy_pages_acquire(bp); } if (bp->b_bufsize != 0) vfs_setdirty_range(bp); bogus = false; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; vm_page_assert_sbusied(m); /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ if (clear_modify) { pmap_remove_write(m); vfs_page_set_validclean(bp, foff, m); } else if (vm_page_all_valid(m) && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus = true; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bogus && buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * vfs_bio_set_valid: * * Set the range within the buffer to valid. The range is * relative to the beginning of the buffer, b_offset. Note that * b_offset itself may be offset from the beginning of the first * page. */ void vfs_bio_set_valid(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); /* * Busy may not be strictly necessary here because the pages are * unlikely to be fully valid and the vnode lock will synchronize * their access via getpages. It is grabbed for consistency with * other page validation. 
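* * Worked example for the loop below (illustrative; assuming 4KB pages): * with bp->b_offset = 0x2200 and a caller base of 0x300, the fixup above * yields base = 0x500, so the first iteration validates n = 0xb00 bytes * of page 0 and later iterations a full page each.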
*/ vfs_busy_pages_acquire(bp); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_valid_range(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vfs_busy_pages_release(bp); } /* * vfs_bio_clrbuf: * * If the specified buffer is a non-VMIO buffer, clear the entire * buffer. If the specified buffer is a VMIO buffer, clear and * validate only the previously invalid portions of the buffer. * This routine essentially fakes an I/O, so we need to clear * BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask, sa, ea, slide; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages_acquire(bp); sa = bp->b_offset & PAGE_MASK; slide = 0; for (i = 0; i < bp->b_npages; i++, sa = 0) { slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize); ea = slide & PAGE_MASK; if (ea == 0) ea = PAGE_SIZE; if (bp->b_pages[i] == bogus_page) continue; j = sa / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) pmap_zero_page_area(bp->b_pages[i], sa, ea - sa); else { for (; sa < ea; sa += DEV_BSIZE, j++) { if ((bp->b_pages[i]->valid & (1 << j)) == 0) { pmap_zero_page_area(bp->b_pages[i], sa, DEV_BSIZE); } } } vm_page_set_valid_range(bp->b_pages[i], j * DEV_BSIZE, roundup2(ea - sa, DEV_BSIZE)); } vfs_busy_pages_release(bp); bp->b_resid = 0; } void vfs_bio_bzero_buf(struct buf *bp, int base, int size) { vm_page_t m; int i, n; if (buf_mapped(bp)) { BUF_CHECK_MAPPED(bp); bzero(bp->b_data + base, size); } else { BUF_CHECK_UNMAPPED(bp); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; pmap_zero_page_area(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * Update buffer flags based on I/O request parameters, optionally releasing the * buffer. If it's VMIO or direct I/O, the buffer pages are released to the VM, * where they may be placed on a page queue (VMIO) or freed immediately (direct * I/O). Otherwise the buffer is released to the cache. */ static void b_io_dismiss(struct buf *bp, int ioflag, bool release) { KASSERT((ioflag & IO_NOREUSE) == 0 || (ioflag & IO_VMIO) != 0, ("buf %p non-VMIO noreuse", bp)); if ((ioflag & IO_DIRECT) != 0) bp->b_flags |= B_DIRECT; if ((ioflag & IO_EXT) != 0) bp->b_xflags |= BX_ALTDATA; if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; if ((ioflag & IO_NOREUSE) != 0) bp->b_flags |= B_NOREUSE; if (release) brelse(bp); } else if (release) bqrelse(bp); } void vfs_bio_brelse(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, true); } void vfs_bio_set_flags(struct buf *bp, int ioflag) { b_io_dismiss(bp, ioflag, false); } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffer's address space. The pages are anonymous and are * not associated with a file object.
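* * Descriptive note (not part of this change): the expected pairing is * visible in the non-VMIO resize helpers above, where vfs_nonvmio_extend() * grows the mapping with vm_hold_load_pages() and vfs_nonvmio_truncate() * releases the tail again with vm_hold_free_pages().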
*/ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; BUF_CHECK_MAPPED(bp); to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT) | VM_ALLOC_WAITOK); pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, int newbsize) { vm_offset_t from; vm_page_t p; int index, newnpages; BUF_CHECK_MAPPED(bp); from = round_page((vm_offset_t)bp->b_data + newbsize); newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; if (bp->b_npages > newnpages) pmap_qremove(from, bp->b_npages - newnpages); for (index = newnpages; index < bp->b_npages; index++) { p = bp->b_pages[index]; bp->b_pages[index] = NULL; vm_page_unwire_noq(p); vm_page_free(p); } bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. * * This function only works with pager buffers. */ int vmapbuf(struct buf *bp, int mapbuf) { vm_prot_t prot; int pidx; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages, btoc(MAXPHYS))) < 0) return (-1); bp->b_npages = pidx; bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK; if (mapbuf || !unmapped_buf_allowed) { pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, pidx); bp->b_data = bp->b_kvabase + bp->b_offset; } else bp->b_data = unmapped_buf; return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. * * This function only works with pager buffers. 
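* * Minimal sketch of the intended vmapbuf()/vunmapbuf() pairing for a * pager buffer (illustrative only): * * if (vmapbuf(bp, 1) < 0) * return (EFAULT); * ... issue and await the I/O ... * vunmapbuf(bp);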
*/ void vunmapbuf(struct buf *bp) { int npages; npages = bp->b_npages; if (buf_mapped(bp)) pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_unhold_pages(bp->b_pages, npages); bp->b_data = unmapped_buf; } void bdone(struct buf *bp) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(mtxp); } void bwait(struct buf *bp, u_char pri, const char *wchan) { struct mtx *mtxp; mtxp = mtx_pool_find(mtxpool_sleep, bp); mtx_lock(mtxp); while ((bp->b_flags & B_DONE) == 0) msleep(bp, mtxp, pri, wchan, 0); mtx_unlock(mtxp); } int bufsync(struct bufobj *bo, int waitfor) { return (VOP_FSYNC(bo2vnode(bo), waitfor, curthread)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i __unused; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } /* * Initialize a struct bufobj before use. Memory is assumed zero filled. */ void bufobj_init(struct bufobj *bo, void *private) { static volatile int bufobj_cleanq; bo->bo_domain = atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains; rw_init(BO_LOCKPTR(bo), "bufobj interlock"); bo->bo_private = private; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_WLOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_WLOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } /* * Set bio_data or bio_ma for struct bio from the struct buf. */ void bdata2bio(struct buf *bp, struct bio *bip) { if (!buf_mapped(bp)) { KASSERT(unmapped_buf_allowed, ("unmapped")); bip->bio_ma = bp->b_pages; bip->bio_ma_n = bp->b_npages; bip->bio_data = unmapped_buf; bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bip->bio_flags |= BIO_UNMAPPED; KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) / PAGE_SIZE == bp->b_npages, ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset, (long long)bip->bio_length, bip->bio_ma_n)); } else { bip->bio_data = bp->b_data; bip->bio_ma = NULL; } } /* * The MIPS pmap code currently doesn't handle aliased pages. * The VIPT caches may not handle page aliasing themselves, leading * to data corruption. * * As such, this code makes a system extremely unhappy if said * system doesn't support unaliasing the above situation in hardware. * Some "recent" systems (eg some mips24k/mips74k cores) don't enable * this feature at build time, so it has to be handled in software. * * Once the MIPS pmap/cache code grows to support this function on * earlier chips, it should be flipped back off. 
*/ #ifdef __mips__ static int buf_pager_relbuf = 1; #else static int buf_pager_relbuf = 0; #endif SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN, &buf_pager_relbuf, 0, "Make buffer pager release buffers after reading"); /* * The buffer pager. It uses buffer reads to validate pages. * * In contrast to the generic local pager from vm/vnode_pager.c, this * pager correctly and easily handles volumes where the underlying * device block size is greater than the machine page size. The * buffer cache transparently extends the requested page run to be * aligned at the block boundary, and does the necessary bogus page * replacements in the addends to avoid obliterating already valid * pages. * * The only non-trivial issue is that the exclusive busy state for * pages, which is assumed by the vm_pager_getpages() interface, is * incompatible with the VMIO buffer cache's desire to share-busy the * pages. This function performs a trivial downgrade of the pages' * state before reading buffers, and a less trivial upgrade from the * shared-busy to excl-busy state after the read. */ int vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno, vbg_get_blksize_t get_blksize) { vm_page_t m; vm_object_t object; struct buf *bp; struct mount *mp; daddr_t lbn, lbnp; vm_ooffset_t la, lb, poff, poffe; long bsize; int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b; bool redo, lpart; object = vp->v_object; mp = vp->v_mount; error = 0; la = IDX_TO_OFF(ma[count - 1]->pindex); if (la >= object->un_pager.vnp.vnp_size) return (VM_PAGER_BAD); /* * Change the meaning of la from where the last requested page starts * to where it ends, because that's the end of the requested region * and the start of the potential read-ahead region. */ la += PAGE_SIZE; lpart = la > object->un_pager.vnp.vnp_size; bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex))); /* * Calculate read-ahead, behind and total pages. */ pgsin = count; lb = IDX_TO_OFF(ma[0]->pindex); pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs)); pgsin += pgsin_b; if (rbehind != NULL) *rbehind = pgsin_b; pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la); if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size) pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size, PAGE_SIZE) - la); pgsin += pgsin_a; if (rahead != NULL) *rahead = pgsin_a; VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, pgsin); br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) ? GB_UNMAPPED : 0; again: for (i = 0; i < count; i++) { if (ma[i] != bogus_page) vm_page_busy_downgrade(ma[i]); } lbnp = -1; for (i = 0; i < count; i++) { m = ma[i]; if (m == bogus_page) continue; /* * Pages are shared busy and the object lock is not * owned, which together allow for the pages' * invalidation. The racy test for validity avoids * useless creation of the buffer for the most typical * case when invalidation is not used in redo or for * parallel read. The shared->excl upgrade loop at * the end of the function catches the race in a * reliable way (protected by the object lock). 
*/ if (vm_page_all_valid(m)) continue; poff = IDX_TO_OFF(m->pindex); poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size); for (; poff < poffe; poff += bsize) { lbn = get_lblkno(vp, poff); if (lbn == lbnp) goto next_page; lbnp = lbn; bsize = get_blksize(vp, lbn); error = bread_gb(vp, lbn, bsize, curthread->td_ucred, br_flags, &bp); if (error != 0) goto end_pages; if (bp->b_rcred == curthread->td_ucred) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (LIST_EMPTY(&bp->b_dep)) { /* * Invalidation clears m->valid, but * may leave B_CACHE flag if the * buffer existed at the invalidation * time. In this case, recycle the * buffer to do real read on next * bread() after redo. * * Otherwise B_RELBUF is not strictly * necessary, but enabling it reduces * buf cache pressure. */ if (buf_pager_relbuf || !vm_page_all_valid(m)) bp->b_flags |= B_RELBUF; bp->b_flags &= ~B_NOCACHE; brelse(bp); } else { bqrelse(bp); } } KASSERT(1 /* racy, enable for debugging */ || vm_page_all_valid(m) || i == count - 1, ("buf %d %p invalid", i, m)); if (i == count - 1 && lpart) { if (!vm_page_none_valid(m) && !vm_page_all_valid(m)) vm_page_zero_invalid(m, TRUE); } next_page:; } end_pages: redo = false; for (i = 0; i < count; i++) { if (ma[i] == bogus_page) continue; if (vm_page_busy_tryupgrade(ma[i]) == 0) { vm_page_sunbusy(ma[i]); ma[i] = vm_page_grab_unlocked(object, ma[i]->pindex, VM_ALLOC_NORMAL); } /* * Since the pages were only sbusy while neither the * buffer nor the object lock was held by us, or * reallocated while vm_page_grab() slept for busy * relinquish, they could have been invalidated. * Recheck the valid bits and re-read as needed. * * Note that the last page is made fully valid in the * read loop, and partial validity for the page at * index count - 1 could mean that the page was * invalidated or removed, so we must restart for * safety as well. */ if (!vm_page_all_valid(ma[i])) redo = true; } if (redo && error == 0) goto again; return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } #include "opt_ddb.h" #ifdef DDB #include <ddb/ddb.h> /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; #ifdef FULL_BUF_TRACKING uint32_t i, j; #endif if (!have_addr) { db_printf("usage: show buffer <addr>\n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b, b_xflags=0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags, PRINT_BUF_XFLAGS); db_printf("b_vflags=0x%b b_ioflags=0x%b\n", (u_int)bp->b_vflags, PRINT_BUF_VFLAGS, (u_int)bp->b_ioflags, PRINT_BIO_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p\n, b_blkno = %jd, b_lblkno = %jd, " "b_vp = %p, b_dep = %p\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno, bp->b_vp, bp->b_dep.lh_first); db_printf("b_kvabase = %p, b_kvasize = %d\n", bp->b_kvabase, bp->b_kvasize); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; if (m != NULL) db_printf("(%p, 0x%lx, 0x%lx)", m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); else db_printf("( ???
)"); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } BUF_LOCKPRINTINFO(bp); #if defined(FULL_BUF_TRACKING) db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt); i = bp->b_io_tcnt % BUF_TRACKING_SIZE; for (j = 1; j <= BUF_TRACKING_SIZE; j++) { if (bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)] == NULL) continue; db_printf(" %2u: %s\n", j, bp->b_io_tracking[BUF_TRACKING_ENTRY(i - j)]); } #elif defined(BUF_TRACKING) db_printf("b_io_tracking: %s\n", bp->b_io_tracking); #endif db_printf(" "); } DB_SHOW_COMMAND(bufqueues, bufqueues) { struct bufdomain *bd; struct buf *bp; long total; int i, j, cnt; db_printf("bqempty: %d\n", bqempty.bq_len); for (i = 0; i < buf_domains; i++) { bd = &bdomain[i]; db_printf("Buf domain %d\n", i); db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers); db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers); db_printf("\thifreebufs\t%d\n", bd->bd_hifreebuffers); db_printf("\n"); db_printf("\tbufspace\t%ld\n", bd->bd_bufspace); db_printf("\tmaxbufspace\t%ld\n", bd->bd_maxbufspace); db_printf("\thibufspace\t%ld\n", bd->bd_hibufspace); db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace); db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh); db_printf("\n"); db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers); db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers); db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers); db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh); db_printf("\n"); total = 0; TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tcleanq count\t%d (%ld)\n", bd->bd_cleanq->bq_len, total); total = 0; TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist) total += bp->b_bufsize; db_printf("\tdirtyq count\t%d (%ld)\n", bd->bd_dirtyq.bq_len, total); db_printf("\twakeup\t\t%d\n", bd->bd_wanted); db_printf("\tlim\t\t%d\n", bd->bd_lim); db_printf("\tCPU "); for (j = 0; j <= mp_maxid; j++) db_printf("%d, ", bd->bd_subq[j].bq_len); db_printf("\n"); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) { cnt++; total += buf[j].b_bufsize; } db_printf("\tLocked buffers: %d space %ld\n", cnt, total); cnt = 0; total = 0; for (j = 0; j < nbuf; j++) if (buf[j].b_domain == i) { cnt++; total += buf[j].b_bufsize; } db_printf("\tTotal buffers: %d space %ld\n", cnt, total); } } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (BUF_ISLOCKED(bp)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); if (db_pager_quit) break; } } } DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs) { struct vnode *vp; struct buf *bp; if (!have_addr) { db_printf("usage: show vnodebufs \n"); return; } vp = (struct vnode *)addr; db_printf("Clean buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } db_printf("Dirty buffers:\n"); TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } DB_COMMAND(countfreebufs, db_coundfreebufs) { struct buf *bp; int i, used = 0, nfree = 0; if (have_addr) { db_printf("usage: countfreebufs\n"); return; } for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (bp->b_qindex == QUEUE_EMPTY) nfree++; else used++; } db_printf("Counted %d free, %d used (%d tot)\n", nfree, used, nfree + used); db_printf("numfreebuffers is %d\n", numfreebuffers); } #endif /* DDB */ Index: head/sys/kern/vfs_subr.c 
=================================================================== --- head/sys/kern/vfs_subr.c (revision 364371) +++ head/sys/kern/vfs_subr.c (revision 364372) @@ -1,6672 +1,6679 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); /* * These fences are intended for cases where some synchronization is * needed between access of v_iflags and lockless vnode refcount (v_holdcnt * and v_usecount) updates. 
Access to v_iflags is generally synchronized * by the interlock, but we have some internal assertions that check vnode * flags without acquiring the lock. Thus, these fences are INVARIANTS-only * for now. */ #ifdef INVARIANTS #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() #else #define VNODE_REFCOUNT_FENCE_ACQ() #define VNODE_REFCOUNT_FENCE_REL() #endif /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. */ static u_long __exclusive_cache_line numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of allocated vnodes in the system. */ static TAILQ_HEAD(freelst, vnode) vnode_list; static struct vnode *vnode_list_free_marker; static struct vnode *vnode_list_reclaim_marker; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recycling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active.
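* (Arithmetic check: with MAXVNODES_MAX = 8M as defined further below, * 0.09 * 0.75 * 8388608 is about 566231, which is where the 566000 * figure comes from.)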
*/ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static long freevnodes_old; static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); static counter_u64_t recycles_free_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t deferred_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, metadata for filesystems * mounted on block devices is delayed only about half the time that * file data is delayed. Similarly, directory updates are more critical, * so are only delayed about a third of the time that file data is * delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed.
* Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; long freevnodes; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static u_long vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. 
*/ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } counter_u64_add(recycles_count, 1); vgone(vp); putvnode: NDFREE(&nd, 0); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; counter_u64_add(recycles_count, 1); vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. 
In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; int cpu, physvnodes, virtvnodes; u_int i; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. 
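* * Worked example (illustrative; assuming 4KB pages): with 4GB of RAM, * pgtok(vm_cnt.v_page_count) is about 4194304KB, so physvnodes is roughly * maxproc + 4194304/64 + 3 * 1572864/64 = maxproc + 65536 + 73728. The * 1:16 ratio therefore holds only while the memory size is below * 98304 * 16KB; past that point each additional 64KB of memory adds * just one vnode.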
*/ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); uma_zone_set_smr(vnode_zone, vfs_smr); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); recycles_free_count = counter_u64_alloc(M_WAITOK); deferred_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. 
* * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mp, ref, 1); vfs_mp_count_add_pcpu(mp, lockref, 1); vfs_op_thread_exit(mp); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If the mount point is currently being unmounted, sleep until the * mount point's fate is decided. If the thread doing the unmounting fails, * it will clear the MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set the MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that the mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that the mount point it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mp, lockref, 1); vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement a simple direct-mapped * cache for popular filesystem identifiers. The cache is lockless, using * the fact that struct mount's are never freed. In the worst case we may * get a pointer to an unmounted or even a different filesystem, so we have to * check what we got, and go the slow way if so.
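* * Illustrative cache-slot computation (hypothetical fsid values): * * hash = 0x12345678 ^ 0x9abcdef0; hash is now 0x88888888 * hash = (hash >> 16 ^ hash) & 255; hash is now 0, i.e. cache[0]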
*/ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If the file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did the original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) to make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision.
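 *
 * The knob is exposed as the vfs.timestamp_precision sysctl declared
 * below, so the precision can be changed at runtime, e.g.:
 *
 *	# sysctl vfs.timestamp_precision=3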
*/ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, &timestamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL. */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Try to reduce the total number of vnodes. * * This routine (and its user) are buggy in at least the following ways: * - all parameters were picked years ago when RAM sizes were significantly * smaller * - it can pick vnodes based on pages used by the vm object, but filesystems * like ZFS don't use it, making the pick broken * - since ZFS has its own aging policy, it gets partially combated by this one * - a dedicated method should be provided for filesystems to let them decide * whether the vnode should be recycled * * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * The buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. * * @param reclaim_nc_src Only reclaim directories with outgoing namecache * entries if this argument is true. * @param trigger Only reclaim vnodes with fewer than this many resident * pages. * @param target How many vnodes to reclaim. * @return The number of vnodes that were reclaimed. */ static int vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) { struct vnode *vp, *mvp; struct mount *mp; struct vm_object *object; u_long done; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); retried = false; done = 0; mvp = vnode_list_reclaim_marker; restart: vp = mvp; while (done < target) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) break; if (__predict_false(vp->v_type == VMARKER)) continue; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it.
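 *
 * Note that the checks below run twice: first racily, without the
 * interlock, as a cheap filter, and then again under the interlock
 * before the vnode is committed to. A vnode that changes state between
 * the two passes is simply skipped.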
*/ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; if (!VI_TRYLOCK(vp)) goto next_iter; if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || VN_IS_DOOMED(vp) || vp->v_type == VNON) { VI_UNLOCK(vp); goto next_iter; } object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(mp); goto next_iter_unlocked; } counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(mp); done++; next_iter_unlocked: if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_locked(int count, struct vfsops *mnt_op) { struct vnode *vp, *mvp; struct mount *mp; int ocount; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; ocount = count; mvp = vnode_list_free_marker; restart: vp = mvp; while (count > 0) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); break; } if (__predict_false(vp->v_type == VMARKER)) continue; /* * Don't recycle if our vnode is from a different type * of mount point. Note that mp is type-safe; the * check does not reach an unmapped address even if * the vnode is reclaimed. * Don't recycle if we can't get the interlock without * blocking.
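 *
 * As an illustration (hypothetical caller; myfs_vfsops is a made-up
 * name), a filesystem wanting to shed only its own free vnodes would
 * go through the vnlru_free() wrapper below:
 *
 *	vnlru_free(32, &myfs_vfsops);
 *
 * while passing a NULL mnt_op matches vnodes from any mount.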
*/ if (vp->v_holdcnt > 0 || (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { continue; } TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); continue; } vholdl(vp); count--; mtx_unlock(&vnode_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); vdrop(vp); mtx_lock(&vnode_list_mtx); goto restart; } return (ocount - count); } void vnlru_free(int count, struct vfsops *mnt_op) { mtx_lock(&vnode_list_mtx); vnlru_free_locked(count, mnt_op); mtx_unlock(&vnode_list_mtx); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrureclaim() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; /* * The main freevnodes counter is only updated when threads requeue their vnode * batches. CPUs are conditionally walked to compute a more accurate total. * * Limit how much slop we are willing to tolerate. Note: the actual value * at any given moment can still exceed slop, but it should not be off by a * significant margin in practice. */ #define VNLRU_FREEVNODES_SLOP 128 static u_long vnlru_read_freevnodes(void) { struct vdbatch *vd; long slop; int cpu; mtx_assert(&vnode_list_mtx, MA_OWNED); if (freevnodes > freevnodes_old) slop = freevnodes - freevnodes_old; else slop = freevnodes_old - freevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (freevnodes >= 0 ? freevnodes : 0); freevnodes_old = freevnodes; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); freevnodes_old += vd->freevnodes; } return (freevnodes_old >= 0 ? freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static bool vnlru_under_unlocked(u_long rnumvnodes, u_long limit) { long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = atomic_load_long(&freevnodes); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list.
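 *
 * (One way to exercise this path from userland is to lower the limit
 * at runtime, e.g. "sysctl kern.maxvnodes=100000", which adjusts
 * desiredvnodes and leaves any excess to be discarded here.)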
*/ if (rnumvnodes > desiredvnodes) { vnlru_free_locked(rnumvnodes - desiredvnodes, NULL); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or by modestly * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = 0; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(UMA_RECLAIM_DRAIN); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may be found and locked via some other list; if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it.
*/ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); VI_UNLOCK(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { counter_u64_add(recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); VI_UNLOCK(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. * * Given the age of this commit (almost 15 years at the time of writing this * comment), restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs an M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static struct vnode * __noinline vn_alloc_hard(struct mount *mp) { u_long rnumvnodes, rfreevnodes; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (rnumvnodes + 1 < desiredvnodes) { vn_alloc_cyclecount = 0; goto alloc; } rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked(1, NULL) > 0) goto alloc; if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ vnlru_kick(); msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked(1, NULL); } alloc: rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (vnlru_under(rnumvnodes, vlowat)) vnlru_kick(); mtx_unlock(&vnode_list_mtx); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { atomic_subtract_long(&numvnodes, 1); return (vn_alloc_hard(mp)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc. * * Locks live in a witness group keyed on their name.
Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. * * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem uses a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; v_init_counters(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash so that vfs_hash_index() is useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to clean up (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone.
*/ vn_seqc_write_end_locked(vp); VNPASS(vp->v_seqc_users == 0, vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { destroy_vpollinfo(vp->v_pollinfo); vp->v_pollinfo = NULL; } #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_irflag = 0; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. 
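 *
 * A typical creator pairs this with getnewvnode() (sketch; the "myfs"
 * names are illustrative only):
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	... set up v_data, lock the vnode exclusively ...
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);	(the default dtr already destroyed vp)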
*/ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj. * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop, please notice that the interlock is dropped * and reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list.
* */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. 
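 * howmany() rounds up: with blksize 4096, a length of 8192 yields
 * startlbn 2 (lbn 2 is the first block invalidated), while a length of
 * 8193 yields startlbn 3, preserving the partially valid block 2.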
*/ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). */ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! 
*/ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. 
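 *
 * The pending table behaves like a delay wheel: an entry due "delay"
 * seconds from now is placed in slot (syncer_delayno + delay) &
 * syncer_mask, and the syncer drains one slot per second. For example,
 * assuming syncer_maxdelay is 32 (mask 31), current slot 30 and delay 6,
 * the entry lands in slot (30 + 6) & 31 = 4.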
*/ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since it * was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second.
*/ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting the watchdog. */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time; otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run through its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode.
*/ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. 
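 *
 * The intended consumer shape is (sketch; the lookup stands for whatever
 * lockless structure the caller traverses):
 *
 *	vfs_smr_enter();
 *	vp = ...lockless lookup...;
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		...retry with locks...;
 *	error = vget_finish(vp, LK_SHARED, vs);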
*/ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int vget(struct vnode *vp, int flags) { enum vgetstate vs; vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0, it will be utilized to keep * the vnode around. Otherwise someone else lent their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); #else refcount_acquire(&vp->v_usecount); #endif } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode.
*/ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode without hold count", __func__)); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); counter_u64_add(deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. */ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. 
* * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { struct vdbatch *vd; int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old != 0) return; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes--; critical_exit(); } void vholdl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vhold(vp); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. 
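 *
 * A bare-hold consumer looks like the following sketch (vget_prep_smr()
 * above is the common wrapper around this):
 *
 *	vfs_smr_enter();
 *	vp = ...lockless lookup...;
 *	if (!vhold_smr(vp)) {
 *		vfs_smr_exit();
 *		...the vnode was being freed, retry...
 *	}
 *	vfs_smr_exit();
 *	...vp is now held and cannot be freed out from under us...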
*/ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) return (true); } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); mtx_lock(&vnode_list_mtx); critical_enter(); freevnodes += vd->freevnodes; for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->freevnodes = 0; bzero(vd->tab, sizeof(vd->tab)); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNASSERT(!VN_IS_DOOMED(vp), vp, ("%s: deferring requeue of a doomed vnode", __func__)); critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes++; if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); critical_exit(); return; } sched_pin(); critical_exit(); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. */ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vnodes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, ("%s: called for a used vnode\n", __func__)); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode, we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED), in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void vdrop_deactivate(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); /* * Mark a vnode as free: remove it from its active list * and put it up for recycling on the freelist.
*/ VNASSERT(!VN_IS_DOOMED(vp), vp, ("vdrop: returning doomed vnode")); VNASSERT(vp->v_op != NULL, vp, ("vdrop: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("vnode with VI_OWEINACT set")); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("vnode with VI_DEFINACT set")); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST")); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. */ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } vdbatch_enqueue(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } if (!VN_IS_DOOMED(vp)) { vdrop_deactivate(vp); /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. */ return; } /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. */ if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) { VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } freevnode(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static void vinactivef(struct vnode *vp) { struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } VOP_INACTIVE(vp, curthread); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } void vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return; if (vp->v_iflag & VI_DOINGINACT) return; if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return; } vinactivef(vp); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error).
If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list.
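 *
 * A minimal usage sketch (hypothetical caller; vrecyclel() asserts that
 * the vnode lock is held exclusively):
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	if (vrecycle(vp))
 *		...the unused vnode was doomed and may be reused...
 *	VOP_UNLOCK(vp);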
*/ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_uppers)) return; mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_irflag & VIRF_DOOMED) return; /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vp->v_irflag |= VIRF_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); vinactivef(vp); VI_UNLOCK(vp); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. 
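 *
 * The first vinvalbuf() call uses V_SAVE so that dirty buffers get
 * written back; should that fail, the remaining buffers are discarded
 * by retrying without V_SAVE.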
*/ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. */ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge_vgone(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; } /* * Print out a description of a vnode. */ static const char * const typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) 
{ va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s\n", typename[vp->v_type]); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); if (vp->v_irflag & VIRF_PGREAD) strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); flags = vp->v_irflag & ~(VIRF_DOOMED | VIRF_PGREAD); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_TEXT_REF) strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, 
vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount <addr>\n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. 
*/ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. 
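 * The count is only used to size an over-estimated output buffer, with
 * KINFO_VNODESLOP entries of slop on top.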
*/ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. 
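 * A racy answer is tolerable: a vnode which becomes dirty right after
 * the check is expected to be picked up by a later periodic pass.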
*/ if (vp->v_vflag & VV_NOSYNC) return (false); obj = vp->v_object; return (obj != NULL && vm_object_mightbedirty(obj)); } static int vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) { if (vp->v_vflag & VV_NOSYNC) return (false); if (vp->v_iflag & VI_DEFINACT) return (true); return (vfs_want_msync(vp)); } static void __noinline vfs_periodic_msync_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; struct thread *td; int lkflags, objflags; bool seen_defer; td = curthread; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) { lkflags |= LK_NOWAIT; objflags = OBJPC_NOSYNC; } else { objflags = OBJPC_SYNC; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { seen_defer = false; if (vp->v_iflag & VI_DEFINACT) { vp->v_iflag &= ~VI_DEFINACT; seen_defer = true; } if (!vfs_want_msync(vp)) { if (seen_defer) vfs_deferred_inactive(vp, lkflags); else VI_UNLOCK(vp); continue; } if (vget(vp, lkflags) == 0) { obj = vp->v_object; if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, objflags); VM_OBJECT_WUNLOCK(obj); } vput(vp); if (seen_defer) vdrop(vp); } else { if (seen_defer) vdefer_inactive_unlocked(vp); } } } void vfs_periodic(struct mount *mp, int flags) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) vfs_periodic_inactive(mp, flags); else vfs_periodic_msync_inactive(mp, flags); } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode.
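 *
 * The syncer vnode sits on the syncer worklist; when the syncer daemon
 * reaches it, its VOP_FSYNC (sync_fsync() below) requeues the vnode and
 * performs a lazy VFS_SYNC of the filesystem.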
*/ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; VFS_VOP_VECTOR_REGISTER(sync_vnodeops); /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); /* * The filesystem at hand may be idle with free vnodes stored in the * batch. Return them instead of letting them stay there indefinitely. */ vfs_periodic(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no longer referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx.
*/ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; int need; MPASS(mtx_owned(VI_MTX(vp))); need = 0; if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) need = 1; return (need); } /* * Check if vnode represents a disk device */ -int -vn_isdisk(struct vnode *vp, int *errp) +bool +vn_isdisk_error(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: - if (errp != NULL) - *errp = error; + *errp = error; return (error == 0); +} + +bool +vn_isdisk(struct vnode *vp) +{ + int error; + + return (vn_isdisk_error(vp, &error)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. * * We never deny as priv_check_cred calls are not yet supported, see vaccess. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); return (EAGAIN); } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); return (EAGAIN); } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); return (EAGAIN); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. 
*/ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to suppress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. */ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
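 * The buffer-locked assertion below would trip on such container
 * buffers, hence the early return for B_CLUSTER.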
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); 
if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? 
*/ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ 
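/*
 * EVFILT_READ filter: kn_data is set to the amount of data past the
 * current file offset; with NOTE_FILE_POLL the knote fires even when no
 * new data is pending. On revoke the knote is flagged EV_EOF | EV_ONESHOT.
 */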
static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } /* * Returns whether the directory is empty or not. * If it is empty, the return value is 0; otherwise * the return value is an error value (which may * be ENOTEMPTY). */ int vfs_emptydir(struct vnode *vp) { struct uio uio; struct iovec iov; struct dirent *dirent, *dp, *endp; int error, eof; error = 0; eof = 0; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); iov.iov_base = dirent; iov.iov_len = sizeof(struct dirent); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(struct dirent); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; while (eof == 0 && error == 0) { error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, NULL, NULL); if (error != 0) break; endp = (void *)((uint8_t *)dirent + sizeof(struct dirent) - uio.uio_resid); for (dp = dirent; dp < endp; dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { if (dp->d_type == DT_WHT) continue; if (dp->d_namlen == 0) continue; if (dp->d_type != DT_DIR && dp->d_type != DT_UNKNOWN) { error = ENOTEMPTY; break; } if (dp->d_namlen > 2) { error = ENOTEMPTY; break; } if (dp->d_namlen == 1 && dp->d_name[0] != '.') { error = ENOTEMPTY; break; } if (dp->d_namlen == 2 && dp->d_name[1] != '.') { error = ENOTEMPTY; break; } uio.uio_resid = sizeof(struct dirent); } } free(dirent, M_TEMP); return (error); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. 
* * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns a nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. */ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. */ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " "by VFS_CACHEDROOT and the one cached " "(%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; if (!vfs_op_thread_enter(mp)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH.
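 *
 * Usage sketch (assuming the wrappers in sys/mount.h; the loop body
 * runs with the vnode interlock held and must release it, and an
 * early exit must go through MNT_VNODE_FOREACH_ALL_ABORT()):
 *
 *	struct vnode *vp, *mvp;
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type == VNON) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		... operate on vp ...
 *		VI_UNLOCK(vp);
 *	}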
*/ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. 
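 * In that case refcount_release_if_not_last() refuses the release and
 * we fall through to vdropl() below, which performs the full
 * final-reference processing; the relock attempt is then reported as
 * lost.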
*/ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. */ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. 
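 *
 * General seqc usage sketch, for illustration: writers bracket vnode
 * state changes while lockless readers sample the counter and
 * re-validate it afterwards; vn_seqc_read_any() and
 * vn_seqc_consistent() are declared in sys/vnode.h:
 *
 *	vn_seqc_write_begin(vp);
 *	... modify the vnode ...
 *	vn_seqc_write_end(vp);
 *
 *	seq = vn_seqc_read_any(vp);
 *	... lockless reads ...
 *	if (!vn_seqc_consistent(vp, seq))
 *		... retry or fall back to a locked lookup ...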
*/ void vn_seqc_write_begin_unheld_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); vn_seqc_write_begin_unheld_locked(vp); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_begin_unheld(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_unheld_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 364371) +++ head/sys/sys/vnode.h (revision 364372) @@ -1,1076 +1,1077 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; enum vgetstate { VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). 
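 * For example, UFS hangs its struct inode here and nullfs its struct
 * null_node; the pointer is opaque to the rest of the VFS.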
*/ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * i - interlock * l - mp mnt_listmtx or freelist mutex * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ enum vtype v_type:8; /* u vnode type */ short v_irflag; /* i frequently read flags */ seqc_t v_seqc; /* i modification count */ uint32_t v_nchash; /* u namecache hash */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. */ union { struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ struct cdev *v_rdev; /* v device (VCHR, VBLK) */ struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ }; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_vnodelist; /* l vnode lists */ TAILQ_ENTRY(vnode) v_lazylist; /* l vnode lazy list */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ u_int v_holdcnt; /* I prevents recycling. 
*/ u_int v_usecount; /* I ref count of users */ u_short v_iflag; /* i vnode flags (see below) */ u_short v_vflag; /* v vnode flags */ u_short v_mflag; /* l mnt-specific vnode flags */ short v_dbatchcpu; /* i LRU requeue deferral batch */ int v_writecount; /* I ref count of writers or (negative) text users */ int v_seqc_users; /* i modifications pending */ u_int v_hash; }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VIRF_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. 
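 *
 * A common consumer pattern, as in the mount vnode iterators in
 * vfs_subr.c, is an unlocked VN_IS_DOOMED() peek to skip doomed
 * vnodes cheaply, followed by a re-check under the interlock before
 * relying on the result:
 *
 *	if (VN_IS_DOOMED(vp))
 *		continue;
 *	VI_LOCK(vp);
 *	if (VN_IS_DOOMED(vp)) {
 *		VI_UNLOCK(vp);
 *		continue;
 *	}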
*/ #define VHOLD_NO_SMR (1<<29) /* Disable vhold_smr */ #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR) #define VIRF_DOOMED 0x0001 /* This vnode is being recycled */ #define VIRF_PGREAD 0x0002 /* Direct reads from the page cache are permitted, never cleared once set */ #define VI_TEXT_REF 0x0001 /* Text ref grabbed use ref */ #define VI_MOUNT 0x0002 /* Mount in progress */ #define VI_DOINGINACT 0x0004 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x0008 /* Need to call inactive */ #define VI_DEFINACT 0x0010 /* deferred inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_VMSIZEVNLOCK 0x0020 /* object size check requires vnode lock */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VMP_LAZYLIST 0x0001 /* Vnode is on mnt's lazy list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ u_short va_padding0; uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ nlink_t va_nlink; /* number of references to file */ dev_t va_fsid; /* filesystem id */ ino_t va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. 
(high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather than bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_NOREUSE 0x0200 /* VMIO data won't be reused */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */ #define IO_RANGELOCKED 0x4000 /* range locked */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. */ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow changing the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif extern u_int ncsizefactor; /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions.
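 *
 * Usage sketch, for illustration: an unmount path typically flushes
 * a filesystem's vnodes with
 *
 *	error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td);
 *
 * while code that must write out dirty buffers before invalidating a
 * vnode's buffer cache uses
 *
 *	error = vinvalbuf(vp, V_SAVE, 0, 0);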
*/ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define V_VMIO 0x0010 /* vinvalbuf: called during pageout */ #define V_ALLOWCLEAN 0x0020 /* vinvalbuf: allow clean buffers after flush */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern u_long desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ int vdesc_vop_offset; vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. 
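 *
 * Sketch of how a stacking filesystem's bypass routine, in the style
 * of nullfs, walks the descriptor to find its vnode arguments;
 * VOPARG_OFFSETTO() is defined below:
 *
 *	struct vnodeop_desc *descp = ap->a_desc;
 *	struct vnode **vpp;
 *	int i;
 *
 *	for (i = 0; i < VDESC_MAX_VPS; i++) {
 *		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
 *			break;
 *		vpp = VOPARG_OFFSETTO(struct vnode **,
 *		    descp->vdesc_vp_offsets[i], ap);
 *		... substitute the lower vnode for *vpp ...
 *	}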
*/ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); void assert_vop_locked(struct vnode *vp, const char *str); void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #define ASSERT_VOP_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #define ASSERT_VOP_NOT_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_IN_SEQC(vp) ((void)0) #define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 #define VN_OPEN_INVFS 0x00000008 /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct freebsd11_stat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. 
*/ void cache_changesize(u_long newhashsize); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_vnode_init(struct vnode *vp); void cache_purge(struct vnode *vp); void cache_purge_vgone(struct vnode *vp); void cache_purge_negative(struct vnode *vp); void cache_purgevfs(struct mount *mp, bool force); int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); void getnewvnode_reserve(void); void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int speedup_syncer(void); int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen); int vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen); int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); struct vnode * vn_dir_dd_ino(struct vnode *vp); int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred); int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred); void vattr_null(struct vattr *vap); void vlazy(struct vnode *); void vdrop(struct vnode *); void vdropl(struct vnode *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int flags); enum vgetstate vget_prep_smr(struct vnode *vp); enum vgetstate vget_prep(struct vnode *vp); int vget_finish(struct vnode *vp, int flags, enum vgetstate vs); void vget_finish_ref(struct vnode *vp, enum vgetstate vs); void vget_abort(struct vnode *vp, enum vgetstate vs); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdl(struct vnode *); void vholdnz(struct vnode *); bool vhold_smr(struct vnode *); void vinactive(struct vnode *vp); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, off_t length, int blksize); void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) 
__printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor); int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); int vn_need_pageq_flush(struct vnode *vp); -int vn_isdisk(struct vnode *vp, int *errp); +bool vn_isdisk_error(struct vnode *vp, int *errp); +bool vn_isdisk(struct vnode *vp); int _vn_lock(struct vnode *vp, int flags, const char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); void vn_seqc_write_begin_unheld_locked(struct vnode *vp); void vn_seqc_write_begin_unheld(struct vnode *vp); void vn_seqc_write_begin_locked(struct vnode *vp); void vn_seqc_write_begin(struct vnode *vp); void vn_seqc_write_end_locked(struct vnode *vp); void vn_seqc_write_end(struct vnode *vp); #define vn_seqc_read_any(vp) seqc_read_any(&(vp)->v_seqc) #define vn_seqc_consistent(vp, seq) 
seqc_consistent(&(vp)->v_seqc, seq) #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_tryrlock(vp, start, end) \ rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_trywlock(vp, start, end) \ rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdioctl(struct vop_ioctl_args *); int vop_stdneed_inactive(struct vop_need_inactive_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_lock(struct vop_lock1_args *); int vop_unlock(struct vop_unlock_args *); int vop_islocked(struct vop_islocked_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stdset_text(struct vop_set_text_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. 
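 * (They are wired into the wrappers generated from vnode_if.src as
 * pre/post hooks, so individual filesystems do not call them
 * directly.)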
*/ void vop_close_post(void *a, int rc); void vop_create_pre(void *a); void vop_create_post(void *a, int rc); void vop_whiteout_pre(void *a); void vop_whiteout_post(void *a, int rc); void vop_deleteextattr_pre(void *a); void vop_deleteextattr_post(void *a, int rc); void vop_link_pre(void *a); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_pre(void *a); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_pre(void *a); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_pre(void *a); void vop_setattr_post(void *a, int rc); void vop_setacl_pre(void *a); void vop_setacl_post(void *a, int rc); void vop_setextattr_pre(void *a); void vop_setextattr_post(void *a, int rc); void vop_symlink_pre(void *a); void vop_symlink_post(void *a, int rc); int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a); #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *a); void vop_fplookup_vexec_debugpost(void *a, int rc); void vop_strategy_debugpre(void *a); void vop_lock_debugpre(void *a); void vop_lock_debugpost(void *a, int rc); void vop_unlock_debugpre(void *a); void vop_need_inactive_debugpre(void *a); void vop_need_inactive_debugpost(void *a, int rc); #else #define vop_fplookup_vexec_debugpre(x) do { } while (0) #define vop_fplookup_vexec_debugpost(x, y) do { } while (0) #define vop_strategy_debugpre(x) do { } while (0) #define vop_lock_debugpre(x) do { } while (0) #define vop_lock_debugpost(x, y) do { } while (0) #define vop_unlock_debugpre(x) do { } while (0) #define vop_need_inactive_debugpre(x) do { } while (0) #define vop_need_inactive_debugpost(x, y) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); #define vop_stat_helper_pre(ap) ({ \ int _error; \ AUDIT_ARG_VNODE1(ap->a_vp); \ _error = mac_vnode_check_stat(ap->a_active_cred, ap->a_file_cred, ap->a_vp);\ if (__predict_true(_error == 0)) \ bzero(ap->a_sb, sizeof(*ap->a_sb)); \ _error; \ }) #define vop_stat_helper_post(ap, error) ({ \ int _error = (error); \ if (priv_check_cred_vfs_generation(ap->a_td->td_ucred)) \ ap->a_sb->st_gen = 0; \ _error; \ }) #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = (off_t)va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? 
NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) #ifdef INVARIANTS #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) \ do { \ int error_; \ \ error_ = VOP_ADD_WRITECOUNT((vp), (cnt)); \ VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d", \ error_)); \ } while (0) #define VOP_SET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_SET_TEXT((vp)); \ VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d", \ error_)); \ } while (0) #define VOP_UNSET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_UNSET_TEXT((vp)); \ VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d", \ error_)); \ } while (0) #else #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) VOP_ADD_WRITECOUNT((vp), (cnt)) #define VOP_SET_TEXT_CHECKED(vp) VOP_SET_TEXT((vp)) #define VOP_UNSET_TEXT_CHECKED(vp) VOP_UNSET_TEXT((vp)) #endif #define VN_IS_DOOMED(vp) __predict_false((vp)->v_irflag & VIRF_DOOMED) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); void vrefact(struct vnode *vp); void v_addpollinfo(struct vnode *vp); static __inline int vrefcnt(struct vnode *vp) { return (vp->v_usecount); } #define vrefl(vp) do { \ ASSERT_VI_LOCKED(vp, __func__); \ vref(vp); \ } while (0) int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_ENOENT ((void*)(uintptr_t)vop_enoent) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* fifo_vnops.c */ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); void vfs_hash_changesize(u_long newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); int vfs_emptydir(struct vnode *vp); int vfs_unixify_accmode(accmode_t *accmode); void vfs_unp_reclaim(struct vnode *vp); int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode); int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid); int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); void vn_fsid(struct vnode *vp, struct vattr *va); int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp); #define VOP_UNLOCK_FLAGS(vp, flags) ({ \ struct vnode *_vp = (vp); \ int _flags = (flags); \ int _error; \ \ if ((_flags & ~(LK_INTERLOCK | LK_RELEASE)) != 0) \ panic("%s: unsupported flags %x\n", __func__, flags); \ _error = VOP_UNLOCK(_vp); \ if (_flags & LK_INTERLOCK) \ 
VI_UNLOCK(_vp); \ _error; \ }) #include #define VFS_VOP_VECTOR_REGISTER(vnodeops) \ SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \ vfs_vector_op_register, &vnodeops) #define VFS_SMR_DECLARE \ extern smr_t vfs_smr #define VFS_SMR() vfs_smr #define vfs_smr_enter() smr_enter(VFS_SMR()) #define vfs_smr_exit() smr_exit(VFS_SMR()) #define vfs_smr_entered_load(ptr) smr_entered_load((ptr), VFS_SMR()) #define VFS_SMR_ASSERT_ENTERED() SMR_ASSERT_ENTERED(VFS_SMR()) #define VFS_SMR_ASSERT_NOT_ENTERED() SMR_ASSERT_NOT_ENTERED(VFS_SMR()) #define VFS_SMR_ZONE_SET(zone) uma_zone_set_smr((zone), VFS_SMR()) #define vn_load_v_data_smr(vp) ({ \ struct vnode *_vp = (vp); \ \ VFS_SMR_ASSERT_ENTERED(); \ atomic_load_ptr(&(_vp)->v_data); \ }) #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ffs/ffs_softdep.c =================================================================== --- head/sys/ufs/ffs/ffs_softdep.c (revision 364371) +++ head/sys/ufs/ffs/ffs_softdep.c (revision 364372) @@ -1,14826 +1,14826 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 1998, 2000 Marshall Kirk McKusick. * Copyright 2009, 2010 Jeffrey W. Roberson * All rights reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * * Further information about soft updates can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 */ #include __FBSDID("$FreeBSD$"); #include "opt_ffs.h" #include "opt_quota.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KTR_SUJ 0 /* Define to KTR_SPARE. 
*/ #ifndef SOFTUPDATES int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { panic("softdep_flushfiles called"); } int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { return (0); } void softdep_initialize() { return; } void softdep_uninitialize() { return; } void softdep_unmount(mp) struct mount *mp; { panic("softdep_unmount called"); } void softdep_setup_sbupdate(ump, fs, bp) struct ufsmount *ump; struct fs *fs; struct buf *bp; { panic("softdep_setup_sbupdate called"); } void softdep_setup_inomapdep(bp, ip, newinum, mode) struct buf *bp; struct inode *ip; ino_t newinum; int mode; { panic("softdep_setup_inomapdep called"); } void softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; int frags; int oldfrags; { panic("softdep_setup_blkmapdep called"); } void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocdirect called"); } void softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocext called"); } void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; ufs_lbn_t lbn; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; struct buf *nbp; { panic("softdep_setup_allocindir_page called"); } void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; struct inode *ip; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; { panic("softdep_setup_allocindir_meta called"); } void softdep_journal_freeblocks(ip, cred, length, flags) struct inode *ip; struct ucred *cred; off_t length; int flags; { panic("softdep_journal_freeblocks called"); } void softdep_journal_fsync(ip) struct inode *ip; { panic("softdep_journal_fsync called"); } void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; off_t length; int flags; { panic("softdep_setup_freeblocks called"); } void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { panic("softdep_freefile called"); } int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; struct inode *dp; off_t diroffset; ino_t newinum; struct buf *newdirbp; int isnewblk; { panic("softdep_setup_directory_add called"); } void softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) struct buf *bp; struct inode *dp; caddr_t base; caddr_t oldloc; caddr_t newloc; int entrysize; { panic("softdep_change_directoryentry_offset called"); } void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; int isrmdir; { panic("softdep_setup_remove called"); } void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; ino_t newinum; int isrmdir; { panic("softdep_setup_directory_change called"); } void softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) struct mount *mp; struct buf *bp; ufs2_daddr_t blkno; int frags; struct workhead *wkhd; { panic("%s called", __FUNCTION__); } void softdep_setup_inofree(mp, bp, ino, wkhd) struct mount *mp; struct buf *bp; ino_t ino; struct workhead *wkhd; { panic("%s called", __FUNCTION__); } void 
softdep_setup_unlink(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_rmdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_rmdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_create(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_create(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_mkdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_mkdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_dotdot_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } int softdep_prealloc(vp, waitok) struct vnode *vp; int waitok; { panic("%s called", __FUNCTION__); } int softdep_journal_lookup(mp, vpp) struct mount *mp; struct vnode **vpp; { return (ENOENT); } void softdep_change_linkcnt(ip) struct inode *ip; { panic("softdep_change_linkcnt called"); } void softdep_load_inodeblock(ip) struct inode *ip; { panic("softdep_load_inodeblock called"); } void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; struct buf *bp; int waitfor; { panic("softdep_update_inodeblock called"); } int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { return (0); } void softdep_fsync_mountdev(vp) struct vnode *vp; { return; } int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { *countp = 0; return (0); } int softdep_sync_metadata(struct vnode *vp) { panic("softdep_sync_metadata called"); } int softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) { panic("softdep_sync_buf called"); } int softdep_slowdown(vp) struct vnode *vp; { panic("softdep_slowdown called"); } int softdep_request_cleanup(fs, vp, cred, resource) struct fs *fs; struct vnode *vp; struct ucred *cred; int resource; { return (0); } int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_depcnt, int softdep_accdepcnt, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; int error; (void) softdep_depcnt, (void) softdep_accdepcnt; bo = &devvp->v_bufobj; ASSERT_BO_WLOCKED(bo); MNT_ILOCK(mp); while (mp->mnt_secondary_writes != 0) { BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); MNT_ILOCK(mp); } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. 
* - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; BO_UNLOCK(bo); return (error); } void softdep_get_depcounts(struct mount *mp, int *softdepactivep, int *softdepactiveaccp) { (void) mp; *softdepactivep = 0; *softdepactiveaccp = 0; } void softdep_buf_append(bp, wkhd) struct buf *bp; struct workhead *wkhd; { panic("softdep_buf_appendwork called"); } void softdep_inode_append(ip, cred, wkhd) struct inode *ip; struct ucred *cred; struct workhead *wkhd; { panic("softdep_inode_appendwork called"); } void softdep_freework(wkhd) struct workhead *wkhd; { panic("softdep_freework called"); } #else FEATURE(softupdates, "FFS soft-updates support"); static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "soft updates stats"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "total dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "high use dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "current dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "current dependencies written"); unsigned long dep_current[D_LAST + 1]; unsigned long dep_highuse[D_LAST + 1]; unsigned long dep_total[D_LAST + 1]; unsigned long dep_write[D_LAST + 1]; #define SOFTDEP_TYPE(type, str, long) \ static MALLOC_DEFINE(M_ ## type, #str, long); \ SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ &dep_total[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ &dep_current[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, \ &dep_highuse[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ &dep_write[D_ ## type], 0, ""); SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, "Block or frag allocated from cyl group map"); SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); SOFTDEP_TYPE(SBDEP, 
sbdep, "Superblock write dependency"); SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel"); static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data"); #define M_SOFTDEP_FLAGS (M_WAITOK) /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX */ static struct malloc_type *memtype[] = { NULL, M_PAGEDEP, M_INODEDEP, M_BMSAFEMAP, M_NEWBLK, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, M_FREEFRAG, M_FREEBLKS, M_FREEFILE, M_DIRADD, M_MKDIR, M_DIRREM, M_NEWDIRBLK, M_FREEWORK, M_FREEDEP, M_JADDREF, M_JREMREF, M_JMVREF, M_JNEWBLK, M_JFREEBLK, M_JFREEFRAG, M_JSEG, M_JSEGDEP, M_SBDEP, M_JTRUNC, M_JFSYNC, M_SENTINEL }; #define DtoM(type) (memtype[type]) /* * Names of malloc types. */ #define TYPENAME(type) \ ((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \ memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) /* * Internal function prototypes. */ static void check_clear_deps(struct mount *); static void softdep_error(char *, int); static int softdep_process_worklist(struct mount *, int); static int softdep_waitidle(struct mount *, int); static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct rwlock *, int); static int check_inodedep_free(struct inodedep *); static void clear_remove(struct mount *); static void clear_inodedeps(struct mount *); static void unlinked_inodedep(struct mount *, struct inodedep *); static void clear_unlinked_inodedep(struct inodedep *); static struct inodedep *first_unlinked_inodedep(struct ufsmount *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); static int free_pagedep(struct pagedep *); static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int sync_cgs(struct mount *, int); static int handle_written_filepage(struct pagedep *, struct buf *, int); static int handle_written_sbdep(struct sbdep *, struct buf *); static void initiate_write_sbdep(struct sbdep *); static void diradd_inode_written(struct diradd *, struct inodedep *); static int handle_written_indirdep(struct indirdep *, struct buf *, struct buf**, int); static int handle_written_inodeblock(struct inodedep *, struct buf *, int); static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, uint8_t *); static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int); static void handle_written_jaddref(struct jaddref *); static void handle_written_jremref(struct jremref *); static void handle_written_jseg(struct jseg *, struct buf *); static void handle_written_jnewblk(struct jnewblk *); static void handle_written_jblkdep(struct jblkdep *); static void handle_written_jfreefrag(struct jfreefrag *); static void complete_jseg(struct jseg *); static void complete_jsegs(struct jseg *); static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); static void jremref_write(struct jremref *, struct jseg *, uint8_t 
*); static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); static inline void inoref_write(struct inoref *, struct jseg *, struct jrefrec *); static void handle_allocdirect_partdone(struct allocdirect *, struct workhead *); static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, struct workhead *); static void indirdep_complete(struct indirdep *); static int indirblk_lookup(struct mount *, ufs2_daddr_t); static void indirblk_insert(struct freework *); static void indirblk_remove(struct freework *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); static void initiate_write_indirdep(struct indirdep*, struct buf *); static void handle_written_mkdir(struct mkdir *, int); static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, uint8_t *); static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static int handle_workitem_remove(struct dirrem *, int); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); static struct indirdep *indirdep_lookup(struct mount *, struct inode *, struct buf *); static void cancel_indirdep(struct indirdep *, struct buf *, struct freeblks *); static void free_indirdep(struct indirdep *); static void free_diradd(struct diradd *, struct workhead *); static void merge_diradd(struct inodedep *, struct diradd *); static void complete_diradd(struct diradd *); static struct diradd *diradd_lookup(struct pagedep *, int); static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, struct jremref *); static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, struct jremref *); static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, struct jremref *, struct jremref *); static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, struct jremref *); static void cancel_allocindir(struct allocindir *, struct buf *bp, struct freeblks *, int); static int setup_trunc_indir(struct freeblks *, struct inode *, ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); static void complete_trunc_indir(struct freework *); static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, int); static void complete_mkdir(struct mkdir *); static void free_newdirblk(struct newdirblk *); static void free_jremref(struct jremref *); static void free_jaddref(struct jaddref *); static void free_jsegdep(struct jsegdep *); static void free_jsegs(struct jblocks *); static void rele_jseg(struct jseg *); static void free_jseg(struct jseg *, struct jblocks *); static void free_jnewblk(struct jnewblk *); static void free_jblkdep(struct jblkdep *); static void free_jfreefrag(struct jfreefrag *); static void free_freedep(struct freedep *); static void journal_jremref(struct dirrem *, struct jremref *, struct inodedep *); static void cancel_jnewblk(struct jnewblk *, struct workhead *); static int 
cancel_jaddref(struct jaddref *, struct inodedep *, struct workhead *); static void cancel_jfreefrag(struct jfreefrag *); static inline void setup_freedirect(struct freeblks *, struct inode *, int, int); static inline void setup_freeext(struct freeblks *, struct inode *, int, int); static inline void setup_freeindir(struct freeblks *, struct inode *, int, ufs_lbn_t, int); static inline struct freeblks *newfreeblks(struct mount *, struct inode *); static void freeblks_free(struct ufsmount *, struct freeblks *, int); static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, int, int); static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); static int cancel_pagedep(struct pagedep *, struct freeblks *, int); static int deallocate_dependencies(struct buf *, struct freeblks *, int); static void newblk_freefrag(struct newblk*); static void free_newblk(struct newblk *); static void cancel_allocdirect(struct allocdirectlst *, struct allocdirect *, struct freeblks *); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); static void freework_freeblock(struct freework *, u_long); static void freework_enqueue(struct freework *); static int handle_workitem_freeblocks(struct freeblks *, int); static int handle_complete_freeblocks(struct freeblks *, int); static void handle_workitem_indirblk(struct freework *); static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, struct workhead *); static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, struct inodedep *, struct allocindir *, ufs_lbn_t); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, ufs_lbn_t, u_long); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); static struct freefrag *allocindir_merge(struct allocindir *, struct allocindir *); static int bmsafemap_find(struct bmsafemap_hashhead *, int, struct bmsafemap **); static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, int cg, struct bmsafemap *); static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int, struct newblk **); static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, int, struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int softdep_request_cleanup_flush(struct mount *, struct ufsmount *); static void schedule_cleanup(struct mount *); static void softdep_ast_cleanup_proc(struct thread *); static struct ufsmount *softdep_bp_to_mp(struct buf *bp); static int process_worklist_item(struct mount *, int, int); static void process_removes(struct vnode *); static void process_truncates(struct vnode 
*); static void jwork_move(struct workhead *, struct workhead *); static void jwork_insert(struct workhead *, struct jsegdep *); static void add_to_worklist(struct worklist *, int); static void wake_worklist(struct worklist *); static void wait_worklist(struct worklist *, char *); static void remove_from_worklist(struct worklist *); static void softdep_flush(void *); static void softdep_flushjournal(struct mount *); static int softdep_speedup(struct ufsmount *); static void worklist_speedup(struct mount *); static int journal_mount(struct mount *, struct fs *, struct ucred *); static void journal_unmount(struct ufsmount *); static int journal_space(struct ufsmount *, int); static void journal_suspend(struct ufsmount *); static int journal_unsuspend(struct ufsmount *ump); static void softdep_prelink(struct vnode *, struct vnode *); static void add_to_journal(struct worklist *); static void remove_from_journal(struct worklist *); static bool softdep_excess_items(struct ufsmount *, int); static void softdep_process_journal(struct mount *, struct worklist *, int); static struct jremref *newjremref(struct dirrem *, struct inode *, struct inode *ip, off_t, nlink_t); static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, uint16_t); static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, uint16_t); static inline struct jsegdep *inoref_jseg(struct inoref *); static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, ufs2_daddr_t, int); static void adjust_newfreework(struct freeblks *, int); static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); static void move_newblock_dep(struct jaddref *, struct inodedep *); static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, ufs2_daddr_t, long, ufs_lbn_t); static struct freework *newfreework(struct ufsmount *, struct freeblks *, struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); static int jwait(struct worklist *, int); static struct inodedep *inodedep_lookup_ip(struct inode *); static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *); static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); static void handle_jwork(struct workhead *); static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, struct mkdir **); static struct jblocks *jblocks_create(void); static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); static void jblocks_free(struct jblocks *, struct mount *, int); static void jblocks_destroy(struct jblocks *); static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); /* * Exported softdep operations. */ static void softdep_disk_io_initiation(struct buf *); static void softdep_disk_write_complete(struct buf *); static void softdep_deallocate_dependencies(struct buf *); static int softdep_count_dependencies(struct buf *bp, int); /* * Global lock over all of soft updates. */ static struct mtx lk; MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF); #define ACQUIRE_GBLLOCK(lk) mtx_lock(lk) #define FREE_GBLLOCK(lk) mtx_unlock(lk) #define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED) /* * Per-filesystem soft-updates locking. 
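 *
 * As an illustrative sketch (not a routine taken from this file), code
 * that manipulates a mount's soft dependency state brackets the work
 * with the macros defined below, while callees merely assert ownership:
 *
 *	struct ufsmount *ump = VFSTOUFS(mp);
 *
 *	ACQUIRE_LOCK(ump);	<- rw_wlock on the per-mount sd_fslock
 *	... examine or modify per-mount dependency lists ...
 *	FREE_LOCK(ump);		<- rw_wunlock
 *
 * LOCK_OWNED(ump) expands to rw_assert() and is used by internal
 * routines that depend on their caller already holding the lock.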
*/ #define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock) #define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock) #define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock) #define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock) #define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \ RA_WLOCKED) #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) /* * Worklist queue management. * These routines require that the lock be held. */ #ifndef /* NOT */ INVARIANTS #define WORKLIST_INSERT(head, item) do { \ (item)->wk_state |= ONWORKLIST; \ LIST_INSERT_HEAD(head, item, wk_list); \ } while (0) #define WORKLIST_REMOVE(item) do { \ (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE #else /* INVARIANTS */ static void worklist_insert(struct workhead *, struct worklist *, int, const char *, int); static void worklist_remove(struct worklist *, int, const char *, int); #define WORKLIST_INSERT(head, item) \ worklist_insert(head, item, 1, __func__, __LINE__) #define WORKLIST_INSERT_UNLOCKED(head, item)\ worklist_insert(head, item, 0, __func__, __LINE__) #define WORKLIST_REMOVE(item)\ worklist_remove(item, 1, __func__, __LINE__) #define WORKLIST_REMOVE_UNLOCKED(item)\ worklist_remove(item, 0, __func__, __LINE__) static void worklist_insert(head, item, locked, func, line) struct workhead *head; struct worklist *item; int locked; const char *func; int line; { if (locked) LOCK_OWNED(VFSTOUFS(item->wk_mp)); if (item->wk_state & ONWORKLIST) panic("worklist_insert: %p %s(0x%X) already on list, " "added in function %s at line %d", item, TYPENAME(item->wk_type), item->wk_state, item->wk_func, item->wk_line); item->wk_state |= ONWORKLIST; item->wk_func = func; item->wk_line = line; LIST_INSERT_HEAD(head, item, wk_list); } static void worklist_remove(item, locked, func, line) struct worklist *item; int locked; const char *func; int line; { if (locked) LOCK_OWNED(VFSTOUFS(item->wk_mp)); if ((item->wk_state & ONWORKLIST) == 0) panic("worklist_remove: %p %s(0x%X) not on list, " "removed in function %s at line %d", item, TYPENAME(item->wk_type), item->wk_state, item->wk_func, item->wk_line); item->wk_state &= ~ONWORKLIST; item->wk_func = func; item->wk_line = line; LIST_REMOVE(item, wk_list); } #endif /* INVARIANTS */ /* * Merge two jsegdeps keeping only the oldest one as newer references * can't be discarded until after older references. */ static inline struct jsegdep * jsegdep_merge(struct jsegdep *one, struct jsegdep *two) { struct jsegdep *swp; if (two == NULL) return (one); if (one->jd_seg->js_seq > two->jd_seg->js_seq) { swp = one; one = two; two = swp; } WORKLIST_REMOVE(&two->jd_list); free_jsegdep(two); return (one); } /* * If two freedeps are compatible free one to reduce list size. */ static inline struct freedep * freedep_merge(struct freedep *one, struct freedep *two) { if (two == NULL) return (one); if (one->fd_freework == two->fd_freework) { WORKLIST_REMOVE(&two->fd_list); free_freedep(two); } return (one); } /* * Move journal work from one list to another. Duplicate freedeps and * jsegdeps are coalesced to keep the lists as small as possible. 
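 *
 * As a worked example (hypothetical sequence numbers): if dst already
 * holds a jsegdep referencing journal segment 7 and src contributes one
 * referencing segment 9, jsegdep_merge() above keeps the segment-7
 * reference and frees the other, since a newer reference may not be
 * discarded before an older one.  Duplicate freedeps that point at the
 * same freework are likewise collapsed by freedep_merge().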
*/ static void jwork_move(dst, src) struct workhead *dst; struct workhead *src; { struct freedep *freedep; struct jsegdep *jsegdep; struct worklist *wkn; struct worklist *wk; KASSERT(dst != src, ("jwork_move: dst == src")); freedep = NULL; jsegdep = NULL; LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { if (wk->wk_type == D_JSEGDEP) jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); else if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } while ((wk = LIST_FIRST(src)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(dst, wk); if (wk->wk_type == D_JSEGDEP) { jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); continue; } if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } } static void jwork_insert(dst, jsegdep) struct workhead *dst; struct jsegdep *jsegdep; { struct jsegdep *jsegdepn; struct worklist *wk; LIST_FOREACH(wk, dst, wk_list) if (wk->wk_type == D_JSEGDEP) break; if (wk == NULL) { WORKLIST_INSERT(dst, &jsegdep->jd_list); return; } jsegdepn = WK_JSEGDEP(wk); if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { WORKLIST_REMOVE(wk); free_jsegdep(jsegdepn); WORKLIST_INSERT(dst, &jsegdep->jd_list); } else free_jsegdep(jsegdep); } /* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); static void workitem_alloc(struct worklist *, int, struct mount *); static void workitem_reassign(struct worklist *, int); #define WORKITEM_FREE(item, type) \ workitem_free((struct worklist *)(item), (type)) #define WORKITEM_REASSIGN(item, type) \ workitem_reassign((struct worklist *)(item), (type)) static void workitem_free(item, type) struct worklist *item; int type; { struct ufsmount *ump; #ifdef INVARIANTS if (item->wk_state & ONWORKLIST) panic("workitem_free: %s(0x%X) still on list, " "added in function %s at line %d", TYPENAME(item->wk_type), item->wk_state, item->wk_func, item->wk_line); if (item->wk_type != type && type != D_NEWBLK) panic("workitem_free: type mismatch %s != %s", TYPENAME(item->wk_type), TYPENAME(type)); #endif if (item->wk_state & IOWAITING) wakeup(item); ump = VFSTOUFS(item->wk_mp); LOCK_OWNED(ump); KASSERT(ump->softdep_deps > 0, ("workitem_free: %s: softdep_deps going negative", ump->um_fs->fs_fsmnt)); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); KASSERT(dep_current[item->wk_type] > 0, ("workitem_free: %s: dep_current[%s] going negative", ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); KASSERT(ump->softdep_curdeps[item->wk_type] > 0, ("workitem_free: %s: softdep_curdeps[%s] going negative", ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); atomic_subtract_long(&dep_current[item->wk_type], 1); ump->softdep_curdeps[item->wk_type] -= 1; #ifdef INVARIANTS LIST_REMOVE(item, wk_all); #endif free(item, DtoM(type)); } static void workitem_alloc(item, type, mp) struct worklist *item; int type; struct mount *mp; { struct ufsmount *ump; item->wk_type = type; item->wk_mp = mp; item->wk_state = 0; ump = VFSTOUFS(mp); ACQUIRE_GBLLOCK(&lk); dep_current[type]++; if (dep_current[type] > dep_highuse[type]) dep_highuse[type] = dep_current[type]; dep_total[type]++; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump); ump->softdep_curdeps[type] += 1; ump->softdep_deps++; ump->softdep_accdeps++; #ifdef INVARIANTS LIST_INSERT_HEAD(&ump->softdep_alldeps[type], item, wk_all); #endif FREE_LOCK(ump); } static void workitem_reassign(item, newtype) struct worklist *item; int newtype; { struct ufsmount *ump; ump = VFSTOUFS(item->wk_mp); LOCK_OWNED(ump); 
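	/*
	 * Sanity-check, then move the per-mount and global accounting
	 * from the old workitem type to the new one; the allocation
	 * itself is reused rather than freed and reallocated.
	 */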
KASSERT(ump->softdep_curdeps[item->wk_type] > 0, ("workitem_reassign: %s: softdep_curdeps[%s] going negative", VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); ump->softdep_curdeps[item->wk_type] -= 1; ump->softdep_curdeps[newtype] += 1; KASSERT(dep_current[item->wk_type] > 0, ("workitem_reassign: %s: dep_current[%s] going negative", VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); ACQUIRE_GBLLOCK(&lk); dep_current[newtype]++; dep_current[item->wk_type]--; if (dep_current[newtype] > dep_highuse[newtype]) dep_highuse[newtype] = dep_current[newtype]; dep_total[newtype]++; FREE_GBLLOCK(&lk); item->wk_type = newtype; } /* * Workitem queue management */ static int max_softdeps; /* maximum number of structs before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ static int proc_waiting; /* tracks whether we have a timeout posted */ static int *stat_countp; /* statistic to count in proc_waiting timeout */ static struct callout softdep_callout; static int req_clear_inodedeps; /* syncer process flush some inodedeps */ static int req_clear_remove; /* syncer process flush some freeblks */ static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */ /* * runtime statistics */ static int stat_flush_threads; /* number of softdep flushing threads */ static int stat_worklist_push; /* number of worklist cleanups */ static int stat_blk_limit_push; /* number of times block limit neared */ static int stat_ino_limit_push; /* number of times inode limit neared */ static int stat_blk_limit_hit; /* number of times block slowdown imposed */ static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ static int stat_journal_min; /* Times hit journal min threshold */ static int stat_journal_low; /* Times hit journal low threshold */ static int stat_journal_wait; /* Times blocked in jwait(). */ static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. 
*/ static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ static int stat_cleanup_failures; /* Number of cleanup requests that failed */ static int stat_emptyjblocks; /* Number of potentially empty journal blocks */ SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD, &stat_flush_threads, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW | CTLFLAG_STATS, &stat_worklist_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW | CTLFLAG_STATS, &stat_sync_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW | CTLFLAG_STATS, &stat_indir_blk_ptrs, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW | CTLFLAG_STATS, &stat_inode_bitmap, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW | CTLFLAG_STATS, &stat_direct_blk_ptrs, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW | CTLFLAG_STATS, &stat_dir_entry, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW | CTLFLAG_STATS, &stat_jaddref, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW | CTLFLAG_STATS, &stat_jnewblk, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_low, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_min, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_wait, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_filepage, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_freeblks, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_inode, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_newblk, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_blkrequests, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_inorequests, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_high_delay, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_retries, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_failures, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW, &softdep_flushcache, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD, &stat_emptyjblocks, 0, ""); 
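/*
 * The counters above are exported under the debug.softdep sysctl tree.
 * As a usage sketch (illustrative userland code, includes and error
 * handling omitted), a monitoring tool could poll one of them with
 * sysctlbyname(3):
 *
 *	int pushes;
 *	size_t len = sizeof(pushes);
 *
 *	if (sysctlbyname("debug.softdep.worklist_push", &pushes, &len,
 *	    NULL, 0) == 0)
 *		printf("%d worklist cleanups\n", pushes);
 *
 * The same values can be read from the command line, e.g.
 * "sysctl debug.softdep.journal_low".
 */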
SYSCTL_DECL(_vfs_ffs); /* Whether to recompute the summary at mount time */ static int compute_summary_at_mount = 0; SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); static int print_threads = 0; SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW, &print_threads, 0, "Notify flusher thread start/stop"); /* List of all filesystems mounted with soft updates */ static TAILQ_HEAD(, mount_softdeps) softdepmounts; /* * This function cleans the worklist for a filesystem. * Each filesystem running with soft dependencies gets its own * thread to run in this function. The thread is started up in * softdep_mount and shut down in softdep_unmount. They show up * as part of the kernel "bufdaemon" process whose process * entry is available in bufdaemonproc. */ static int searchfailed; extern struct proc *bufdaemonproc; static void softdep_flush(addr) void *addr; { struct mount *mp; struct thread *td; struct ufsmount *ump; td = curthread; td->td_pflags |= TDP_NORUNNINGBUF; mp = (struct mount *)addr; ump = VFSTOUFS(mp); atomic_add_int(&stat_flush_threads, 1); ACQUIRE_LOCK(ump); ump->softdep_flags &= ~FLUSH_STARTING; wakeup(&ump->softdep_flushtd); FREE_LOCK(ump); if (print_threads) { if (stat_flush_threads == 1) printf("Running %s at pid %d\n", bufdaemonproc->p_comm, bufdaemonproc->p_pid); printf("Start thread %s\n", td->td_name); } for (;;) { while (softdep_process_worklist(mp, 0) > 0 || (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)) kthread_suspend_check(); ACQUIRE_LOCK(ump); if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdflush", hz / 2); ump->softdep_flags &= ~FLUSH_CLEANUP; /* * Check to see if we are done and need to exit. */ if ((ump->softdep_flags & FLUSH_EXIT) == 0) { FREE_LOCK(ump); continue; } ump->softdep_flags &= ~FLUSH_EXIT; FREE_LOCK(ump); wakeup(&ump->softdep_flags); if (print_threads) printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups); atomic_subtract_int(&stat_flush_threads, 1); kthread_exit(); panic("kthread_exit failed\n"); } } static void worklist_speedup(mp) struct mount *mp; { struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) ump->softdep_flags |= FLUSH_CLEANUP; wakeup(&ump->softdep_flushtd); } static void softdep_send_speedup(struct ufsmount *ump, size_t shortage, u_int flags) { struct buf *bp; if ((ump->um_flags & UM_CANSPEEDUP) == 0) return; bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); bp->b_iocmd = BIO_SPEEDUP; bp->b_ioflags = flags; bp->b_bcount = shortage; g_vfs_strategy(ump->um_bo, bp); bufwait(bp); free(bp, M_TRIM); } static int softdep_speedup(ump) struct ufsmount *ump; { struct ufsmount *altump; struct mount_softdeps *sdp; LOCK_OWNED(ump); worklist_speedup(ump->um_mountp); bd_speedup(); /* * If we have global shortages, then we need other * filesystems to help with the cleanup. Here we wake up a * flusher thread for a filesystem that is over its fair * share of resources.
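 *
 * As a rough worked example (the numbers are illustrative only): with
 * max_softdeps at 1,000,000 and four flusher threads running, a
 * filesystem is over its share of inodedeps once it holds more than
 * 1,000,000 / 4 = 250,000 of them, and over its share of dirrems once
 * it holds more than (1,000,000 / 2) / 4 = 125,000.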
*/ if (req_clear_inodedeps || req_clear_remove) { ACQUIRE_GBLLOCK(&lk); TAILQ_FOREACH(sdp, &softdepmounts, sd_next) { if ((altump = sdp->sd_ump) == ump) continue; if (((req_clear_inodedeps && altump->softdep_curdeps[D_INODEDEP] > max_softdeps / stat_flush_threads) || (req_clear_remove && altump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) / stat_flush_threads)) && TRY_ACQUIRE_LOCK(altump)) break; } if (sdp == NULL) { searchfailed++; FREE_GBLLOCK(&lk); } else { /* * Move to the end of the list so we pick a * different one on our next try. */ TAILQ_REMOVE(&softdepmounts, sdp, sd_next); TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); FREE_GBLLOCK(&lk); if ((altump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) altump->softdep_flags |= FLUSH_CLEANUP; altump->um_softdep->sd_cleanups++; wakeup(&altump->softdep_flushtd); FREE_LOCK(altump); } } return (speedup_syncer()); } /* * Add an item to the end of the work queue. * This routine requires that the lock be held. * This is the only routine that adds items to the list. * The following routine is the only one that removes items * and does so in order from first to last. */ #define WK_HEAD 0x0001 /* Add to HEAD. */ #define WK_NODELAY 0x0002 /* Process immediately. */ static void add_to_worklist(wk, flags) struct worklist *wk; int flags; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); if (wk->wk_state & ONWORKLIST) panic("add_to_worklist: %s(0x%X) already on list", TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST; if (ump->softdep_on_worklist == 0) { LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); ump->softdep_worklist_tail = wk; } else if (flags & WK_HEAD) { LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); } else { LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; } ump->softdep_on_worklist += 1; if (flags & WK_NODELAY) worklist_speedup(wk->wk_mp); } /* * Remove the item to be processed. If we are removing the last * item on the list, we need to recalculate the tail pointer. */ static void remove_from_worklist(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); if (ump->softdep_worklist_tail == wk) ump->softdep_worklist_tail = (struct worklist *)wk->wk_list.le_prev; WORKLIST_REMOVE(wk); ump->softdep_on_worklist -= 1; } static void wake_worklist(wk) struct worklist *wk; { if (wk->wk_state & IOWAITING) { wk->wk_state &= ~IOWAITING; wakeup(wk); } } static void wait_worklist(wk, wmesg) struct worklist *wk; char *wmesg; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); wk->wk_state |= IOWAITING; msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0); } /* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they * appear in the queue. The code below depends on this property to ensure * that blocks of a file are freed before the inode itself is freed. This * ordering ensures that no new triples will be generated * until all the old ones have been purged from the dependency lists. */ static int softdep_process_worklist(mp, full) struct mount *mp; int full; { int cnt, matchcnt; struct ufsmount *ump; long starttime; KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); if (MOUNTEDSOFTDEP(mp) == 0) return (0); matchcnt = 0; ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); starttime = time_second; softdep_process_journal(mp, NULL, full ?
MNT_WAIT : 0); check_clear_deps(mp); while (ump->softdep_on_worklist > 0) { if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) break; else matchcnt += cnt; check_clear_deps(mp); /* * We do not generally want to stop for buffer space, but if * we are really being a buffer hog, we will stop and wait. */ if (should_yield()) { FREE_LOCK(ump); kern_yield(PRI_USER); bwillwrite(); ACQUIRE_LOCK(ump); } /* * Never allow processing to run for more than one * second. This gives the syncer thread the opportunity * to pause if appropriate. */ if (!full && starttime != time_second) break; } if (full == 0) journal_unsuspend(ump); FREE_LOCK(ump); return (matchcnt); } /* * Process all removes associated with a vnode if we are running out of * journal space. Any other process that attempts to flush these will * be unable to do so because we have the vnodes locked. */ static void process_removes(vp) struct vnode *vp; { struct inodedep *inodedep; struct dirrem *dirrem; struct ufsmount *ump; struct mount *mp; ino_t inum; mp = vp->v_mount; ump = VFSTOUFS(mp); LOCK_OWNED(ump); inum = VTOI(vp)->i_number; for (;;) { top: if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) return; LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) { /* * If another thread is trying to lock this vnode * it will fail but we must wait for it to do so * before we can proceed. */ if (dirrem->dm_state & INPROGRESS) { wait_worklist(&dirrem->dm_list, "pwrwait"); goto top; } if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == (COMPLETE | ONWORKLIST)) break; } if (dirrem == NULL) return; remove_from_worklist(&dirrem->dm_list); FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_removes: suspended filesystem"); handle_workitem_remove(dirrem, 0); vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); } } /* * Process all truncations associated with a vnode if we are running out * of journal space. This is called when the vnode lock is already held * and no other process can clear the truncation. */ static void process_truncates(vp) struct vnode *vp; { struct inodedep *inodedep; struct freeblks *freeblks; struct ufsmount *ump; struct mount *mp; ino_t inum; int cgwait; mp = vp->v_mount; ump = VFSTOUFS(mp); LOCK_OWNED(ump); inum = VTOI(vp)->i_number; for (;;) { if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) return; cgwait = 0; TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) { /* Journal entries not yet written. */ if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) { jwait(&LIST_FIRST( &freeblks->fb_jblkdephd)->jb_list, MNT_WAIT); break; } /* Another thread is executing this item. */ if (freeblks->fb_state & INPROGRESS) { wait_worklist(&freeblks->fb_list, "ptrwait"); break; } /* Freeblks is waiting on an inode write. */ if ((freeblks->fb_state & COMPLETE) == 0) { FREE_LOCK(ump); ffs_update(vp, 1); ACQUIRE_LOCK(ump); break; } if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) == (ALLCOMPLETE | ONWORKLIST)) { remove_from_worklist(&freeblks->fb_list); freeblks->fb_state |= INPROGRESS; FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_truncates: " "suspended filesystem"); handle_workitem_freeblocks(freeblks, 0); vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); break; } if (freeblks->fb_cgwait) cgwait++; } if (cgwait) { FREE_LOCK(ump); sync_cgs(mp, MNT_WAIT); ffs_sync_snap(mp, MNT_WAIT); ACQUIRE_LOCK(ump); continue; } if (freeblks == NULL) break; } return; } /* * Process one item on the worklist.
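 *
 * The walk below inserts a sentinel into the pending list so that the
 * per-mount lock can be dropped while an item is processed.  The
 * pattern, sketched with details omitted:
 *
 *	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, ...);
 *	while ((wk = LIST_NEXT(&sentinel, wk_list)) != NULL) {
 *		remove wk from the worklist and drop the lock;
 *		handle wk; retake the lock;
 *	}
 *	LIST_REMOVE(&sentinel, wk_list);
 *
 * Items that must be retried are re-added at the head of the list;
 * they land ahead of the sentinel and so are not revisited this pass.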
*/ static int process_worklist_item(mp, target, flags) struct mount *mp; int target; int flags; { struct worklist sentinel; struct worklist *wk; struct ufsmount *ump; int matchcnt; int error; KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to write as we may * recurse into the copy-on-write routine. */ if (curthread->td_pflags & TDP_COWINPROGRESS) return (-1); PHOLD(curproc); /* Don't let the stack go away. */ ump = VFSTOUFS(mp); LOCK_OWNED(ump); matchcnt = 0; sentinel.wk_mp = NULL; sentinel.wk_type = D_SENTINEL; LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list); for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL; wk = LIST_NEXT(&sentinel, wk_list)) { if (wk->wk_type == D_SENTINEL) { LIST_REMOVE(&sentinel, wk_list); LIST_INSERT_AFTER(wk, &sentinel, wk_list); continue; } if (wk->wk_state & INPROGRESS) panic("process_worklist_item: %p already in progress.", wk); wk->wk_state |= INPROGRESS; remove_from_worklist(wk); FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); switch (wk->wk_type) { case D_DIRREM: /* removal of a directory entry */ error = handle_workitem_remove(WK_DIRREM(wk), flags); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ error = handle_workitem_freeblocks(WK_FREEBLKS(wk), flags); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ handle_workitem_freefrag(WK_FREEFRAG(wk)); error = 0; break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ handle_workitem_freefile(WK_FREEFILE(wk)); error = 0; break; default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); /* NOTREACHED */ } vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); if (error == 0) { if (++matchcnt == target) break; continue; } /* * We have to retry the worklist item later. Wake up any * waiters who may be able to complete it immediately and * add the item back to the head so we don't try to execute * it again. */ wk->wk_state &= ~INPROGRESS; wake_worklist(wk); add_to_worklist(wk, WK_HEAD); } /* Sentinel could've become the tail from remove_from_worklist. */ if (ump->softdep_worklist_tail == &sentinel) ump->softdep_worklist_tail = (struct worklist *)sentinel.wk_list.le_prev; LIST_REMOVE(&sentinel, wk_list); PRELE(curproc); return (matchcnt); } /* * Move dependencies from one buffer to another. */ int softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; struct ufsmount *ump; int dirty; if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL) return (0); KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_move_dependencies called on non-softdep filesystem")); dirty = 0; wktail = NULL; ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); if (wk->wk_type == D_BMSAFEMAP && bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp)) dirty = 1; if (wktail == NULL) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else LIST_INSERT_AFTER(wktail, wk, wk_list); wktail = wk; } FREE_LOCK(ump); return (dirty); } /* * Purge the work list of all items associated with a particular mount point. */
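/*
 * A sketch of the calling contract (illustrative, not a complete
 * caller): *countp accumulates the number of work items processed and
 * a non-zero return is an error from syncing the underlying device.
 *
 *	int count, error;
 *
 *	error = softdep_flushworklist(mp, &count, td);
 *	if (error == 0 && count == 0)
 *		... no dependencies remained to be processed ...
 */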
*/ int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { struct vnode *devvp; struct ufsmount *ump; int count, error; /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing * creates. We continue until no more worklist dependencies * are found. */ *countp = 0; error = 0; ump = VFSTOUFS(oldmnt); devvp = ump->um_devvp; while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { *countp += count; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp); if (error != 0) break; } return (error); } #define SU_WAITIDLE_RETRIES 20 static int softdep_waitidle(struct mount *mp, int flags __unused) { struct ufsmount *ump; struct vnode *devvp; struct thread *td; int error, i; ump = VFSTOUFS(mp); devvp = ump->um_devvp; td = curthread; error = 0; ACQUIRE_LOCK(ump); for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) { ump->softdep_req = 1; KASSERT((flags & FORCECLOSE) == 0 || ump->softdep_on_worklist == 0, ("softdep_waitidle: work added after flush")); msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP, "softdeps", 10 * hz); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp); ACQUIRE_LOCK(ump); if (error != 0) break; } ump->softdep_req = 0; if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) { error = EBUSY; printf("softdep_waitidle: Failed to flush worklist for %p\n", mp); } FREE_LOCK(ump); return (error); } /* * Flush all vnodes and worklist items associated with a specified mount point. */ int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { #ifdef QUOTA struct ufsmount *ump; int i; #endif int error, early, depcount, loopcnt, retry_flush_count, retry; int morework; KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0, ("softdep_flushfiles called on non-softdep filesystem")); loopcnt = 10; retry_flush_count = 3; retry_flush: error = 0; /* * Alternately flush the vnodes associated with the mount * point and process any dependencies that the flushing * creates. In theory, this loop can happen at most twice, * but we give it a few extra just to be sure. */ for (; loopcnt > 0; loopcnt--) { /* * Do another flush in case any vnodes were brought in * as part of the cleanup operations. */ early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH; if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0) break; if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || depcount == 0) break; } /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem * activity can keep us busy forever, so we just fail with EBUSY. */ if (loopcnt == 0) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) panic("softdep_flushfiles: looping"); error = EBUSY; } if (!error) error = softdep_waitidle(oldmnt, flags); if (!error) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { retry = 0; MNT_ILOCK(oldmnt); morework = oldmnt->mnt_nvnodelistsize > 0; #ifdef QUOTA ump = VFSTOUFS(oldmnt); UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] != NULLVP) morework = 1; } UFS_UNLOCK(ump); #endif if (morework) { if (--retry_flush_count > 0) { retry = 1; loopcnt = 3; } else error = EBUSY; } MNT_IUNLOCK(oldmnt); if (retry) goto retry_flush; } } return (error); } /* * Structure hashing. 
* * There are four types of structures that can be looked up: * 1) pagedep structures identified by mount point, inode number, * and logical block. * 2) inodedep structures identified by mount point and inode number. * 3) newblk structures identified by mount point and * physical block number. * 4) bmsafemap structures identified by mount point and * cylinder group number. * * The "pagedep" and "inodedep" dependency structures are hashed * separately from the file blocks and inodes to which they correspond. * This separation helps when the in-memory copy of an inode or * file block must be replaced. It also obviates the need to access * an inode or file page when simply updating (or de-allocating) * dependency structures. Lookup of newblk structures is needed to * find newly allocated blocks when trying to associate them with * their allocdirect or allocindir structure. * * The lookup routines optionally create and hash a new instance when * an existing entry is not found. The bmsafemap lookup routine always * allocates a new structure if an existing one is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ /* * Structures and routines associated with pagedep caching. */ #define PAGEDEP_HASH(ump, inum, lbn) \ (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size]) static int pagedep_find(pagedephd, ino, lbn, pagedeppp) struct pagedep_hashhead *pagedephd; ino_t ino; ufs_lbn_t lbn; struct pagedep **pagedeppp; { struct pagedep *pagedep; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) { *pagedeppp = pagedep; return (1); } } *pagedeppp = NULL; return (0); } /* * Look up a pagedep. Return 1 if found, 0 otherwise. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. */ static int pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) struct mount *mp; struct buf *bp; ino_t ino; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; struct worklist *wk; struct ufsmount *ump; int ret; int i; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if (bp) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type == D_PAGEDEP) { *pagedeppp = WK_PAGEDEP(wk); return (1); } } } pagedephd = PAGEDEP_HASH(ump, ino, lbn); ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); if (ret) { if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); return (1); } if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(ump); pagedep = malloc(sizeof(struct pagedep), M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(ump); ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); if (*pagedeppp) { /* * This should never happen since we only create pagedeps * with the vnode lock held. Could be an assert. */ WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } pagedep->pd_ino = ino; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); for (i = 0; i < DAHASHSZ; i++) LIST_INIT(&pagedep->pd_diraddhd[i]); LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); *pagedeppp = pagedep; return (0); } /* * Structures and routines associated with inodedep caching. 
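 *
 * The INODEDEP_HASH macro below masks the inode number with
 * inodedep_hash_size, the power-of-two-minus-one mask produced by
 * hashinit(9) in softdep_mount().  As an illustrative example, with
 * 1024 buckets the mask is 0x3ff, so inode 5000 falls in bucket
 * 5000 & 0x3ff = 904.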
*/ #define INODEDEP_HASH(ump, inum) \ (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size]) static int inodedep_find(inodedephd, inum, inodedeppp) struct inodedep_hashhead *inodedephd; ino_t inum; struct inodedep **inodedeppp; { struct inodedep *inodedep; LIST_FOREACH(inodedep, inodedephd, id_hash) if (inum == inodedep->id_ino) break; if (inodedep) { *inodedeppp = inodedep; return (1); } *inodedeppp = NULL; return (0); } /* * Look up an inodedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in inodedeppp. */ static int inodedep_lookup(mp, inum, flags, inodedeppp) struct mount *mp; ino_t inum; int flags; struct inodedep **inodedeppp; { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; inodedephd = INODEDEP_HASH(ump, inum); if (inodedep_find(inodedephd, inum, inodedeppp)) return (1); if ((flags & DEPALLOC) == 0) return (0); /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and * we are not in a rush, request some inodedep cleanup. */ if (softdep_excess_items(ump, D_INODEDEP)) schedule_cleanup(mp); else FREE_LOCK(ump); inodedep = malloc(sizeof(struct inodedep), M_INODEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); ACQUIRE_LOCK(ump); if (inodedep_find(inodedephd, inum, inodedeppp)) { WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } inodedep->id_fs = fs; inodedep->id_ino = inum; inodedep->id_state = ALLCOMPLETE; inodedep->id_nlinkdelta = 0; inodedep->id_nlinkwrote = -1; inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_savednlink = -1; inodedep->id_bmsafemap = NULL; inodedep->id_mkdiradd = NULL; LIST_INIT(&inodedep->id_dirremhd); LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoreflst); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); TAILQ_INIT(&inodedep->id_newextupdt); TAILQ_INIT(&inodedep->id_freeblklst); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); *inodedeppp = inodedep; return (0); } /* * Structures and routines associated with newblk caching. */ #define NEWBLK_HASH(ump, inum) \ (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size]) static int newblk_find(newblkhd, newblkno, flags, newblkpp) struct newblk_hashhead *newblkhd; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; LIST_FOREACH(newblk, newblkhd, nb_hash) { if (newblkno != newblk->nb_newblkno) continue; /* * If we're creating a new dependency don't match those that * have already been converted to allocdirects. This is for * a frag extend. */ if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) continue; break; } if (newblk) { *newblkpp = newblk; return (1); } *newblkpp = NULL; return (0); } /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. 
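 *
 * Like the other lookup routines in this file, the allocating path
 * must drop the per-mount lock around malloc(9) and then repeat the
 * search, because another thread may have installed a matching entry
 * in the meantime.  The pattern, sketched:
 *
 *	if (entry is in the hash chain) return (1);
 *	FREE_LOCK(ump); allocate a candidate; ACQUIRE_LOCK(ump);
 *	if (entry appeared while unlocked) { free candidate; return (1); }
 *	initialize and insert the candidate; return (0);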
*/ static int newblk_lookup(mp, newblkno, flags, newblkpp) struct mount *mp; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; struct newblk_hashhead *newblkhd; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); newblkhd = NEWBLK_HASH(ump, newblkno); if (newblk_find(newblkhd, newblkno, flags, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); if (softdep_excess_items(ump, D_NEWBLK) || softdep_excess_items(ump, D_ALLOCDIRECT) || softdep_excess_items(ump, D_ALLOCINDIR)) schedule_cleanup(mp); else FREE_LOCK(ump); newblk = malloc(sizeof(union allblk), M_NEWBLK, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); ACQUIRE_LOCK(ump); if (newblk_find(newblkhd, newblkno, flags, newblkpp)) { WORKITEM_FREE(newblk, D_NEWBLK); return (1); } newblk->nb_freefrag = NULL; LIST_INIT(&newblk->nb_indirdeps); LIST_INIT(&newblk->nb_newdirblk); LIST_INIT(&newblk->nb_jwork); newblk->nb_state = ATTACHED; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; return (0); } /* * Structures and routines associated with freed indirect block caching. */ #define INDIR_HASH(ump, blkno) \ (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size]) /* * Lookup an indirect block in the indir hash table. The freework is * removed and potentially freed. The caller must do a blocking journal * write before writing to the blkno. */ static int indirblk_lookup(mp, blkno) struct mount *mp; ufs2_daddr_t blkno; { struct freework *freework; struct indir_hashhead *wkhd; struct ufsmount *ump; ump = VFSTOUFS(mp); wkhd = INDIR_HASH(ump, blkno); TAILQ_FOREACH(freework, wkhd, fw_next) { if (freework->fw_blkno != blkno) continue; indirblk_remove(freework); return (1); } return (0); } /* * Insert an indirect block represented by freework into the indirblk * hash table so that it may prevent the block from being re-used prior * to the journal being written. */ static void indirblk_insert(freework) struct freework *freework; { struct jblocks *jblocks; struct jseg *jseg; struct ufsmount *ump; ump = VFSTOUFS(freework->fw_list.wk_mp); jblocks = ump->softdep_jblocks; jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst); if (jseg == NULL) return; LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs); TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); freework->fw_state &= ~DEPCOMPLETE; } static void indirblk_remove(freework) struct freework *freework; { struct ufsmount *ump; ump = VFSTOUFS(freework->fw_list.wk_mp); LIST_REMOVE(freework, fw_segs); TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); freework->fw_state |= DEPCOMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); } /* * Executed during filesystem system initialization before * mounting any filesystems. */ void softdep_initialize() { TAILQ_INIT(&softdepmounts); #ifdef __LP64__ max_softdeps = desiredvnodes * 4; #else max_softdeps = desiredvnodes * 2; #endif /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; bioops.io_complete = softdep_disk_write_complete; bioops.io_deallocate = softdep_deallocate_dependencies; bioops.io_countdeps = softdep_count_dependencies; softdep_ast_cleanup = softdep_ast_cleanup_proc; /* Initialize the callout with an mtx. */ callout_init_mtx(&softdep_callout, &lk, 0); } /* * Executed after all filesystems have been unmounted during * filesystem module unload. 
*/ void softdep_uninitialize() { /* clear bioops hack */ bioops.io_start = NULL; bioops.io_complete = NULL; bioops.io_deallocate = NULL; bioops.io_countdeps = NULL; softdep_ast_cleanup = NULL; callout_drain(&softdep_callout); } /* * Called at mount time to notify the dependency code that a * filesystem wishes to use it. */ int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { struct csum_total cstotal; struct mount_softdeps *sdp; struct ufsmount *ump; struct cg *cgp; struct buf *bp; u_int cyl, i; int error; sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA, M_WAITOK | M_ZERO); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | MNTK_SOFTDEP | MNTK_NOASYNC; } ump = VFSTOUFS(mp); ump->um_softdep = sdp; MNT_IUNLOCK(mp); rw_init(LOCK_PTR(ump), "per-fs softdep"); sdp->sd_ump = ump; LIST_INIT(&ump->softdep_workitem_pending); LIST_INIT(&ump->softdep_journal_pending); TAILQ_INIT(&ump->softdep_unlinked); LIST_INIT(&ump->softdep_dirtycg); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; LIST_INIT(&ump->softdep_mkdirlisthd); ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &ump->pagedep_hash_size); ump->pagedep_nextclean = 0; ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &ump->inodedep_hash_size); ump->inodedep_nextclean = 0; ump->newblk_hashtbl = hashinit(max_softdeps / 2, M_NEWBLK, &ump->newblk_hash_size); ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &ump->bmsafemap_hash_size); i = 1 << (ffs(desiredvnodes / 10) - 1); ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead), M_FREEWORK, M_WAITOK); ump->indir_hash_size = i - 1; for (i = 0; i <= ump->indir_hash_size; i++) TAILQ_INIT(&ump->indir_hashtbl[i]); #ifdef INVARIANTS for (i = 0; i <= D_LAST; i++) LIST_INIT(&ump->softdep_alldeps[i]); #endif ACQUIRE_GBLLOCK(&lk); TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); FREE_GBLLOCK(&lk); if ((fs->fs_flags & FS_SUJ) && (error = journal_mount(mp, fs, cred)) != 0) { printf("Failed to start journal: %d\n", error); softdep_unmount(mp); return (error); } /* * Start our flushing thread in the bufdaemon process. */ ACQUIRE_LOCK(ump); ump->softdep_flags |= FLUSH_STARTING; FREE_LOCK(ump); kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc, &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker", mp->mnt_stat.f_mntonname); ACQUIRE_LOCK(ump); while ((ump->softdep_flags & FLUSH_STARTING) != 0) { msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart", hz / 2); } FREE_LOCK(ump); /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation * can take a long time and can be deferred for background * fsck. However, the old behavior of scanning the cylinder * groups and recalculating them at mount time is available * by setting vfs.ffs.compute_summary_at_mount to one. 
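 * A filesystem marked clean is trusted to have consistent counters,
 * so the cylinder-group scan below is skipped in that case even when
 * the tunable is set.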
*/ if (compute_summary_at_mount == 0 || fs->fs_clean != 0) return (0); bzero(&cstotal, sizeof cstotal); for (cyl = 0; cyl < fs->fs_ncg; cyl++) { if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), fs->fs_cgsize, cred, &bp)) != 0) { brelse(bp); softdep_unmount(mp); return (error); } cgp = (struct cg *)bp->b_data; cstotal.cs_nffree += cgp->cg_cs.cs_nffree; cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; cstotal.cs_nifree += cgp->cg_cs.cs_nifree; cstotal.cs_ndir += cgp->cg_cs.cs_ndir; fs->fs_cs(fs, cyl) = cgp->cg_cs; brelse(bp); } #ifdef INVARIANTS if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); #endif bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); return (0); } void softdep_unmount(mp) struct mount *mp; { struct ufsmount *ump; #ifdef INVARIANTS int i; #endif KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_unmount called on non-softdep filesystem")); ump = VFSTOUFS(mp); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_SOFTDEP; if (MOUNTEDSUJ(mp) == 0) { MNT_IUNLOCK(mp); } else { mp->mnt_flag &= ~MNT_SUJ; MNT_IUNLOCK(mp); journal_unmount(ump); } /* * Shut down our flushing thread. Check for NULL is if * softdep_mount errors out before the thread has been created. */ if (ump->softdep_flushtd != NULL) { ACQUIRE_LOCK(ump); ump->softdep_flags |= FLUSH_EXIT; wakeup(&ump->softdep_flushtd); msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP, "sdwait", 0); KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0, ("Thread shutdown failed")); } /* * Free up our resources. */ ACQUIRE_GBLLOCK(&lk); TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next); FREE_GBLLOCK(&lk); rw_destroy(LOCK_PTR(ump)); hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size); hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size); hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size); hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP, ump->bmsafemap_hash_size); free(ump->indir_hashtbl, M_FREEWORK); #ifdef INVARIANTS for (i = 0; i <= D_LAST; i++) { KASSERT(ump->softdep_curdeps[i] == 0, ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt, TYPENAME(i), ump->softdep_curdeps[i])); KASSERT(LIST_EMPTY(&ump->softdep_alldeps[i]), ("Unmount %s: Dep type %s not empty (%p)", ump->um_fs->fs_fsmnt, TYPENAME(i), LIST_FIRST(&ump->softdep_alldeps[i]))); } #endif free(ump->um_softdep, M_MOUNTDATA); } static struct jblocks * jblocks_create(void) { struct jblocks *jblocks; jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); TAILQ_INIT(&jblocks->jb_segs); jblocks->jb_avail = 10; jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, M_JBLOCKS, M_WAITOK | M_ZERO); return (jblocks); } static ufs2_daddr_t jblocks_alloc(jblocks, bytes, actual) struct jblocks *jblocks; int bytes; int *actual; { ufs2_daddr_t daddr; struct jextent *jext; int freecnt; int blocks; blocks = bytes / DEV_BSIZE; jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks - jblocks->jb_off; if (freecnt == 0) { jblocks->jb_off = 0; if (++jblocks->jb_head > jblocks->jb_used) jblocks->jb_head = 0; jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks; } if (freecnt > blocks) freecnt = blocks; *actual = freecnt * DEV_BSIZE; daddr = jext->je_daddr + jblocks->jb_off; jblocks->jb_off += freecnt; jblocks->jb_free -= freecnt; return (daddr); } static void jblocks_free(jblocks, mp, bytes) struct jblocks *jblocks; struct mount *mp; int bytes; { LOCK_OWNED(VFSTOUFS(mp)); jblocks->jb_free += bytes / DEV_BSIZE; if (jblocks->jb_suspended) 
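		/*
		 * Returning space may allow a suspended journal to resume;
		 * speed up the worklist, and in any case wake threads
		 * sleeping in softdep_process_journal() waiting for free
		 * journal blocks.
		 */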
worklist_speedup(mp); wakeup(jblocks); } static void jblocks_destroy(jblocks) struct jblocks *jblocks; { if (jblocks->jb_extent) free(jblocks->jb_extent, M_JBLOCKS); free(jblocks, M_JBLOCKS); } static void jblocks_add(jblocks, daddr, blocks) struct jblocks *jblocks; ufs2_daddr_t daddr; int blocks; { struct jextent *jext; jblocks->jb_blocks += blocks; jblocks->jb_free += blocks; jext = &jblocks->jb_extent[jblocks->jb_used]; /* Adding the first block. */ if (jext->je_daddr == 0) { jext->je_daddr = daddr; jext->je_blocks = blocks; return; } /* Extending the last extent. */ if (jext->je_daddr + jext->je_blocks == daddr) { jext->je_blocks += blocks; return; } /* Adding a new extent. */ if (++jblocks->jb_used == jblocks->jb_avail) { jblocks->jb_avail *= 2; jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, M_JBLOCKS, M_WAITOK | M_ZERO); memcpy(jext, jblocks->jb_extent, sizeof(struct jextent) * jblocks->jb_used); free(jblocks->jb_extent, M_JBLOCKS); jblocks->jb_extent = jext; } jext = &jblocks->jb_extent[jblocks->jb_used]; jext->je_daddr = daddr; jext->je_blocks = blocks; return; } int softdep_journal_lookup(mp, vpp) struct mount *mp; struct vnode **vpp; { struct componentname cnp; struct vnode *dvp; ino_t sujournal; int error; error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp); if (error) return (error); bzero(&cnp, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN; cnp.cn_thread = curthread; cnp.cn_cred = curthread->td_ucred; cnp.cn_pnbuf = SUJ_FILE; cnp.cn_nameptr = SUJ_FILE; cnp.cn_namelen = strlen(SUJ_FILE); error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); vput(dvp); if (error != 0) return (error); error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); return (error); } /* * Open and verify the journal file. */ static int journal_mount(mp, fs, cred) struct mount *mp; struct fs *fs; struct ucred *cred; { struct jblocks *jblocks; struct ufsmount *ump; struct vnode *vp; struct inode *ip; ufs2_daddr_t blkno; int bcount; int error; int i; ump = VFSTOUFS(mp); ump->softdep_journal_tail = NULL; ump->softdep_on_journal = 0; ump->softdep_accdeps = 0; ump->softdep_req = 0; ump->softdep_jblocks = NULL; error = softdep_journal_lookup(mp, &vp); if (error != 0) { printf("Failed to find journal. Use tunefs to create one\n"); return (error); } ip = VTOI(vp); if (ip->i_size < SUJ_MIN) { error = ENOSPC; goto out; } bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ jblocks = jblocks_create(); for (i = 0; i < bcount; i++) { error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); if (error) break; jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); } if (error) { jblocks_destroy(jblocks); goto out; } jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ ump->softdep_jblocks = jblocks; out: if (error == 0) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_SUJ; mp->mnt_flag &= ~MNT_SOFTDEP; MNT_IUNLOCK(mp); /* * Only validate the journal contents if the * filesystem is clean, otherwise we write the logs * but they'll never be used. If the filesystem was * still dirty when we mounted it the journal is * invalid and a new journal can only be valid if it * starts from a clean mount. */ if (fs->fs_clean) { DIP_SET(ip, i_modrev, fs->fs_mtime); ip->i_flags |= IN_MODIFIED; ffs_update(vp, 1); } } vput(vp); return (error); } static void journal_unmount(ump) struct ufsmount *ump; { if (ump->softdep_jblocks) jblocks_destroy(ump->softdep_jblocks); ump->softdep_jblocks = NULL; } /* * Called when a journal record is ready to be written. 
 * Space is allocated and the journal entry is created when the journal is
 * flushed to stable store.
 */
static void
add_to_journal(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(wk->wk_mp);
	LOCK_OWNED(ump);
	if (wk->wk_state & ONWORKLIST)
		panic("add_to_journal: %s(0x%X) already on list",
		    TYPENAME(wk->wk_type), wk->wk_state);
	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
		ump->softdep_jblocks->jb_age = ticks;
		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
	} else
		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
	ump->softdep_journal_tail = wk;
	ump->softdep_on_journal += 1;
}

/*
 * Remove an arbitrary item from the journal worklist while maintaining
 * the tail pointer.  This happens when a new operation obviates the
 * need to journal an old operation.
 */
static void
remove_from_journal(wk)
	struct worklist *wk;
{
	struct ufsmount *ump;

	ump = VFSTOUFS(wk->wk_mp);
	LOCK_OWNED(ump);
#ifdef INVARIANTS
	{
		struct worklist *wkn;

		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
			if (wkn == wk)
				break;
		if (wkn == NULL)
			panic("remove_from_journal: %p is not in journal", wk);
	}
#endif
	/*
	 * We emulate a TAILQ to save space in most structures which do not
	 * require TAILQ semantics.  Here we must update the tail position
	 * when removing the tail which is not the final entry.  This works
	 * only if the worklist linkages are at the beginning of the
	 * structure.
	 */
	if (ump->softdep_journal_tail == wk)
		ump->softdep_journal_tail =
		    (struct worklist *)wk->wk_list.le_prev;
	WORKLIST_REMOVE(wk);
	ump->softdep_on_journal -= 1;
}

/*
 * Check for journal space as well as dependency limits so the prelink
 * code can throttle both journaled and non-journaled filesystems.
 * Threshold is 0 for low and 1 for min.
 */
static int
journal_space(ump, thresh)
	struct ufsmount *ump;
	int thresh;
{
	struct jblocks *jblocks;
	int limit, avail;

	jblocks = ump->softdep_jblocks;
	if (jblocks == NULL)
		return (1);
	/*
	 * We use a tighter restriction here to prevent request_cleanup()
	 * running in threads from running into locks we currently hold.
	 * We have to be over the limit and our filesystem has to be
	 * responsible for more than our share of that usage.
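	 * Records still pending in memory (softdep_on_journal) will consume
	 * journal blocks once written, so their worst-case size in JREC_SIZE
	 * units is charged against jb_free before comparing with the
	 * threshold.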
*/ limit = (max_softdeps / 10) * 9; if (dep_current[D_INODEDEP] > limit && ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads) return (0); if (thresh) thresh = jblocks->jb_min; else thresh = jblocks->jb_low; avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; avail = jblocks->jb_free - avail; return (avail > thresh); } static void journal_suspend(ump) struct ufsmount *ump; { struct jblocks *jblocks; struct mount *mp; bool set; mp = UFSTOVFS(ump); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) return; jblocks = ump->softdep_jblocks; vfs_op_enter(mp); set = false; MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { stat_journal_min++; mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = ump->softdep_flushtd; set = true; } jblocks->jb_suspended = 1; MNT_IUNLOCK(mp); if (!set) vfs_op_exit(mp); } static int journal_unsuspend(struct ufsmount *ump) { struct jblocks *jblocks; struct mount *mp; mp = UFSTOVFS(ump); jblocks = ump->softdep_jblocks; if (jblocks != NULL && jblocks->jb_suspended && journal_space(ump, jblocks->jb_min)) { jblocks->jb_suspended = 0; FREE_LOCK(ump); mp->mnt_susp_owner = curthread; vfs_write_resume(mp, 0); ACQUIRE_LOCK(ump); return (1); } return (0); } /* * Called before any allocation function to be certain that there is * sufficient space in the journal prior to creating any new records. * Since in the case of block allocation we may have multiple locked * buffers at the time of the actual allocation we can not block * when the journal records are created. Doing so would create a deadlock * if any of these buffers needed to be flushed to reclaim space. Instead * we require a sufficiently large amount of available space such that * each thread in the system could have passed this allocation check and * still have sufficient free space. With 20% of a minimum journal size * of 1MB we have 6553 records available. */ int softdep_prealloc(vp, waitok) struct vnode *vp; int waitok; { struct ufsmount *ump; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_prealloc called on non-softdep filesystem")); /* * Nothing to do if we are not running journaled soft updates. * If we currently hold the snapshot lock, we must avoid * handling other resources that could cause deadlock. Do not * touch quotas vnode since it is typically recursed with * other vnode locks held. */ if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) || (vp->v_vflag & VV_SYSTEM) != 0) return (0); ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); if (journal_space(ump, 0)) { FREE_LOCK(ump); return (0); } stat_journal_low++; FREE_LOCK(ump); if (waitok == MNT_NOWAIT) return (ENOSPC); /* * Attempt to sync this vnode once to flush any journal * work attached to it. */ if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) ffs_syncvnode(vp, waitok, 0); ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } FREE_LOCK(ump); return (0); } /* * Before adjusting a link count on a vnode verify that we have sufficient * journal space. If not, process operations that depend on the currently * locked pair of vnodes to try to flush space as the syncer, buf daemon, * and softdep flush threads can not acquire these locks to reclaim space. */ static void softdep_prelink(dvp, vp) struct vnode *dvp; struct vnode *vp; { struct ufsmount *ump; ump = VFSTOUFS(dvp->v_mount); LOCK_OWNED(ump); /* * Nothing to do if we have sufficient journal space. 
* If we currently hold the snapshot lock, we must avoid * handling other resources that could cause deadlock. */ if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) return; stat_journal_low++; FREE_LOCK(ump); if (vp) ffs_syncvnode(vp, MNT_NOWAIT, 0); ffs_syncvnode(dvp, MNT_WAIT, 0); ACQUIRE_LOCK(ump); /* Process vp before dvp as it may create .. removes. */ if (vp) { process_removes(vp); process_truncates(vp); } process_removes(dvp); process_truncates(dvp); softdep_speedup(ump); process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } } static void jseg_write(ump, jseg, data) struct ufsmount *ump; struct jseg *jseg; uint8_t *data; { struct jsegrec *rec; rec = (struct jsegrec *)data; rec->jsr_seq = jseg->js_seq; rec->jsr_oldest = jseg->js_oldseq; rec->jsr_cnt = jseg->js_cnt; rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; rec->jsr_crc = 0; rec->jsr_time = ump->um_fs->fs_mtime; } static inline void inoref_write(inoref, jseg, rec) struct inoref *inoref; struct jseg *jseg; struct jrefrec *rec; { inoref->if_jsegdep->jd_seg = jseg; rec->jr_ino = inoref->if_ino; rec->jr_parent = inoref->if_parent; rec->jr_nlink = inoref->if_nlink; rec->jr_mode = inoref->if_mode; rec->jr_diroff = inoref->if_diroff; } static void jaddref_write(jaddref, jseg, data) struct jaddref *jaddref; struct jseg *jseg; uint8_t *data; { struct jrefrec *rec; rec = (struct jrefrec *)data; rec->jr_op = JOP_ADDREF; inoref_write(&jaddref->ja_ref, jseg, rec); } static void jremref_write(jremref, jseg, data) struct jremref *jremref; struct jseg *jseg; uint8_t *data; { struct jrefrec *rec; rec = (struct jrefrec *)data; rec->jr_op = JOP_REMREF; inoref_write(&jremref->jr_ref, jseg, rec); } static void jmvref_write(jmvref, jseg, data) struct jmvref *jmvref; struct jseg *jseg; uint8_t *data; { struct jmvrec *rec; rec = (struct jmvrec *)data; rec->jm_op = JOP_MVREF; rec->jm_ino = jmvref->jm_ino; rec->jm_parent = jmvref->jm_parent; rec->jm_oldoff = jmvref->jm_oldoff; rec->jm_newoff = jmvref->jm_newoff; } static void jnewblk_write(jnewblk, jseg, data) struct jnewblk *jnewblk; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jnewblk->jn_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_NEWBLK; rec->jb_ino = jnewblk->jn_ino; rec->jb_blkno = jnewblk->jn_blkno; rec->jb_lbn = jnewblk->jn_lbn; rec->jb_frags = jnewblk->jn_frags; rec->jb_oldfrags = jnewblk->jn_oldfrags; } static void jfreeblk_write(jfreeblk, jseg, data) struct jfreeblk *jfreeblk; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_FREEBLK; rec->jb_ino = jfreeblk->jf_ino; rec->jb_blkno = jfreeblk->jf_blkno; rec->jb_lbn = jfreeblk->jf_lbn; rec->jb_frags = jfreeblk->jf_frags; rec->jb_oldfrags = 0; } static void jfreefrag_write(jfreefrag, jseg, data) struct jfreefrag *jfreefrag; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jfreefrag->fr_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_FREEBLK; rec->jb_ino = jfreefrag->fr_ino; rec->jb_blkno = jfreefrag->fr_blkno; rec->jb_lbn = jfreefrag->fr_lbn; rec->jb_frags = jfreefrag->fr_frags; rec->jb_oldfrags = 0; } static void jtrunc_write(jtrunc, jseg, data) struct jtrunc *jtrunc; struct jseg *jseg; uint8_t *data; { struct jtrncrec *rec; jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; rec = (struct jtrncrec *)data; rec->jt_op = JOP_TRUNC; rec->jt_ino = 
	    jtrunc->jt_ino;
	rec->jt_size = jtrunc->jt_size;
	rec->jt_extsize = jtrunc->jt_extsize;
}

static void
jfsync_write(jfsync, jseg, data)
	struct jfsync *jfsync;
	struct jseg *jseg;
	uint8_t *data;
{
	struct jtrncrec *rec;

	rec = (struct jtrncrec *)data;
	rec->jt_op = JOP_SYNC;
	rec->jt_ino = jfsync->jfs_ino;
	rec->jt_size = jfsync->jfs_size;
	rec->jt_extsize = jfsync->jfs_extsize;
}

static void
softdep_flushjournal(mp)
	struct mount *mp;
{
	struct jblocks *jblocks;
	struct ufsmount *ump;

	if (MOUNTEDSUJ(mp) == 0)
		return;
	ump = VFSTOUFS(mp);
	jblocks = ump->softdep_jblocks;
	ACQUIRE_LOCK(ump);
	while (ump->softdep_on_journal) {
		jblocks->jb_needseg = 1;
		softdep_process_journal(mp, NULL, MNT_WAIT);
	}
	FREE_LOCK(ump);
}

static void softdep_synchronize_completed(struct bio *);
static void softdep_synchronize(struct bio *, struct ufsmount *, void *);

static void
softdep_synchronize_completed(bp)
	struct bio *bp;
{
	struct jseg *oldest;
	struct jseg *jseg;
	struct ufsmount *ump;

	/*
	 * caller1 marks the last segment written before we issued the
	 * synchronize cache.
	 */
	jseg = bp->bio_caller1;
	if (jseg == NULL) {
		g_destroy_bio(bp);
		return;
	}
	ump = VFSTOUFS(jseg->js_list.wk_mp);
	ACQUIRE_LOCK(ump);
	oldest = NULL;
	/*
	 * Mark all the journal entries waiting on the synchronize cache
	 * as completed so they may continue on.
	 */
	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
		jseg->js_state |= COMPLETE;
		oldest = jseg;
		jseg = TAILQ_PREV(jseg, jseglst, js_next);
	}
	/*
	 * Restart deferred journal entry processing from the oldest
	 * completed jseg.
	 */
	if (oldest)
		complete_jsegs(oldest);
	FREE_LOCK(ump);
	g_destroy_bio(bp);
}

/*
 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
 * barriers.  The journal must be written prior to any blocks that depend
 * on it and the journal can not be released until the blocks have been
 * written.  This code handles both barriers simultaneously.
 */
static void
softdep_synchronize(bp, ump, caller1)
	struct bio *bp;
	struct ufsmount *ump;
	void *caller1;
{

	bp->bio_cmd = BIO_FLUSH;
	bp->bio_flags |= BIO_ORDERED;
	bp->bio_data = NULL;
	bp->bio_offset = ump->um_cp->provider->mediasize;
	bp->bio_length = 0;
	bp->bio_done = softdep_synchronize_completed;
	bp->bio_caller1 = caller1;
	g_io_request(bp, ump->um_cp);
}

/*
 * Flush some journal records to disk.
 */
static void
softdep_process_journal(mp, needwk, flags)
	struct mount *mp;
	struct worklist *needwk;
	int flags;
{
	struct jblocks *jblocks;
	struct ufsmount *ump;
	struct worklist *wk;
	struct jseg *jseg;
	struct buf *bp;
	struct bio *bio;
	uint8_t *data;
	struct fs *fs;
	int shouldflush;
	int segwritten;
	int jrecmin;	/* Minimum records per block. */
	int jrecmax;	/* Maximum records per block. */
	int size;
	int cnt;
	int off;
	int devbsize;

	if (MOUNTEDSUJ(mp) == 0)
		return;
	shouldflush = softdep_flushcache;
	bio = NULL;
	jseg = NULL;
	ump = VFSTOUFS(mp);
	LOCK_OWNED(ump);
	fs = ump->um_fs;
	jblocks = ump->softdep_jblocks;
	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
	/*
	 * We write anywhere between a disk block and fs block.  The upper
	 * bound is picked to prevent buffer cache fragmentation and limit
	 * processing time per I/O.
	 */
	jrecmin = (devbsize / JREC_SIZE) - 1;	/* -1 for seg header */
	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
	segwritten = 0;
	for (;;) {
		cnt = ump->softdep_on_journal;
		/*
		 * Criteria for writing a segment:
		 * 1) We have a full block.
		 * 2) We're called from jwait() and haven't found the
		 *    journal item yet.
		 * 3) Always write if needseg is set.
		 * 4) If we are called from process_worklist and have
		 *    not yet written anything we write a partial block
		 *    to enforce a 1 second maximum latency on journal
		 *    entries.
		 */
		if (cnt < (jrecmax - 1) && needwk == NULL &&
		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
			break;
		cnt++;
		/*
		 * Verify some free journal space.  softdep_prealloc() should
		 * guarantee that we don't run out so this is indicative of
		 * a problem with the flow control.  Try to recover
		 * gracefully in any event.
		 */
		while (jblocks->jb_free == 0) {
			if (flags != MNT_WAIT)
				break;
			printf("softdep: Out of journal space!\n");
			softdep_speedup(ump);
			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
		}
		FREE_LOCK(ump);
		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
		workitem_alloc(&jseg->js_list, D_JSEG, mp);
		LIST_INIT(&jseg->js_entries);
		LIST_INIT(&jseg->js_indirs);
		jseg->js_state = ATTACHED;
		if (shouldflush == 0)
			jseg->js_state |= COMPLETE;
		else if (bio == NULL)
			bio = g_alloc_bio();
		jseg->js_jblocks = jblocks;
		bp = geteblk(fs->fs_bsize, 0);
		ACQUIRE_LOCK(ump);
		/*
		 * If there was a race while we were allocating the block
		 * and jseg, the entry we care about was likely written.
		 * We bail out in both the WAIT and NOWAIT case and assume
		 * the caller will loop if the entry it cares about is
		 * not written.
		 */
		cnt = ump->softdep_on_journal;
		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
			bp->b_flags |= B_INVAL | B_NOCACHE;
			WORKITEM_FREE(jseg, D_JSEG);
			FREE_LOCK(ump);
			brelse(bp);
			ACQUIRE_LOCK(ump);
			break;
		}
		/*
		 * Calculate the disk block size required for the available
		 * records rounded to the min size.
		 */
		if (cnt == 0)
			size = devbsize;
		else if (cnt < jrecmax)
			size = howmany(cnt, jrecmin) * devbsize;
		else
			size = fs->fs_bsize;
		/*
		 * Allocate a disk block for this journal data and account
		 * for truncation of the requested size if enough contiguous
		 * space was not available.
		 */
		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
		bp->b_lblkno = bp->b_blkno;
		bp->b_offset = bp->b_blkno * DEV_BSIZE;
		bp->b_bcount = size;
		bp->b_flags &= ~B_INVAL;
		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
		/*
		 * Initialize our jseg with cnt records.  Assign the next
		 * sequence number to it and link it in-order.
		 */
		cnt = MIN(cnt, (size / devbsize) * jrecmin);
		jseg->js_buf = bp;
		jseg->js_cnt = cnt;
		jseg->js_refs = cnt + 1;	/* Self ref. */
		jseg->js_size = size;
		jseg->js_seq = jblocks->jb_nextseq++;
		if (jblocks->jb_oldestseg == NULL)
			jblocks->jb_oldestseg = jseg;
		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
		if (jblocks->jb_writeseg == NULL)
			jblocks->jb_writeseg = jseg;
		/*
		 * Start filling in records from the pending list.
		 */
		data = bp->b_data;
		off = 0;
		/*
		 * Always put a header on the first block.
		 * XXX As with below, there might not be a chance to get
		 * into the loop.  Ensure that something valid is written.
		 */
		jseg_write(ump, jseg, data);
		off += JREC_SIZE;
		data = bp->b_data + off;
		/*
		 * XXX Something is wrong here.  There's no work to do,
		 * but we need to perform an I/O and allow it to complete
		 * anyway.
		 */
		if (LIST_EMPTY(&ump->softdep_journal_pending))
			stat_emptyjblocks++;
		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
		    != NULL) {
			if (cnt == 0)
				break;
			/* Place a segment header on every device block.
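			 * Each device-block-sized unit of the buffer thus
			 * begins with a struct jsegrec laid down by
			 * jseg_write(), just like the header written on the
			 * first block above.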
*/ if ((off % devbsize) == 0) { jseg_write(ump, jseg, data); off += JREC_SIZE; data = bp->b_data + off; } if (wk == needwk) needwk = NULL; remove_from_journal(wk); wk->wk_state |= INPROGRESS; WORKLIST_INSERT(&jseg->js_entries, wk); switch (wk->wk_type) { case D_JADDREF: jaddref_write(WK_JADDREF(wk), jseg, data); break; case D_JREMREF: jremref_write(WK_JREMREF(wk), jseg, data); break; case D_JMVREF: jmvref_write(WK_JMVREF(wk), jseg, data); break; case D_JNEWBLK: jnewblk_write(WK_JNEWBLK(wk), jseg, data); break; case D_JFREEBLK: jfreeblk_write(WK_JFREEBLK(wk), jseg, data); break; case D_JFREEFRAG: jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); break; case D_JTRUNC: jtrunc_write(WK_JTRUNC(wk), jseg, data); break; case D_JFSYNC: jfsync_write(WK_JFSYNC(wk), jseg, data); break; default: panic("process_journal: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } off += JREC_SIZE; data = bp->b_data + off; cnt--; } /* Clear any remaining space so we don't leak kernel data */ if (size > off) bzero(data, size - off); /* * Write this one buffer and continue. */ segwritten = 1; jblocks->jb_needseg = 0; WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); FREE_LOCK(ump); bp->b_xflags |= BX_CVTENXIO; pbgetvp(ump->um_devvp, bp); /* * We only do the blocking wait once we find the journal * entry we're looking for. */ if (needwk == NULL && flags == MNT_WAIT) bwrite(bp); else bawrite(bp); ACQUIRE_LOCK(ump); } /* * If we wrote a segment issue a synchronize cache so the journal * is reflected on disk before the data is written. Since reclaiming * journal space also requires writing a journal record this * process also enforces a barrier before reclamation. */ if (segwritten && shouldflush) { softdep_synchronize(bio, ump, TAILQ_LAST(&jblocks->jb_segs, jseglst)); } else if (bio) g_destroy_bio(bio); /* * If we've suspended the filesystem because we ran out of journal * space either try to sync it here to make some progress or * unsuspend it if we already have. */ if (flags == 0 && jblocks->jb_suspended) { if (journal_unsuspend(ump)) return; FREE_LOCK(ump); VFS_SYNC(mp, MNT_NOWAIT); ffs_sbupdate(ump, MNT_WAIT, 0); ACQUIRE_LOCK(ump); } } /* * Complete a jseg, allowing all dependencies awaiting journal writes * to proceed. Each journal dependency also attaches a jsegdep to dependent * structures so that the journal segment can be freed to reclaim space. */ static void complete_jseg(jseg) struct jseg *jseg; { struct worklist *wk; struct jmvref *jmvref; #ifdef INVARIANTS int i = 0; #endif while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { WORKLIST_REMOVE(wk); wk->wk_state &= ~INPROGRESS; wk->wk_state |= COMPLETE; KASSERT(i++ < jseg->js_cnt, ("handle_written_jseg: overflow %d >= %d", i - 1, jseg->js_cnt)); switch (wk->wk_type) { case D_JADDREF: handle_written_jaddref(WK_JADDREF(wk)); break; case D_JREMREF: handle_written_jremref(WK_JREMREF(wk)); break; case D_JMVREF: rele_jseg(jseg); /* No jsegdep. */ jmvref = WK_JMVREF(wk); LIST_REMOVE(jmvref, jm_deps); if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) free_pagedep(jmvref->jm_pagedep); WORKITEM_FREE(jmvref, D_JMVREF); break; case D_JNEWBLK: handle_written_jnewblk(WK_JNEWBLK(wk)); break; case D_JFREEBLK: handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); break; case D_JTRUNC: handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); break; case D_JFSYNC: rele_jseg(jseg); /* No jsegdep. 
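			 * Like D_JMVREF above, D_JFSYNC records carry no
			 * jsegdep, so the jseg reference is dropped directly.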
*/ WORKITEM_FREE(wk, D_JFSYNC); break; case D_JFREEFRAG: handle_written_jfreefrag(WK_JFREEFRAG(wk)); break; default: panic("handle_written_jseg: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* Release the self reference so the structure may be freed. */ rele_jseg(jseg); } /* * Determine which jsegs are ready for completion processing. Waits for * synchronize cache to complete as well as forcing in-order completion * of journal entries. */ static void complete_jsegs(jseg) struct jseg *jseg; { struct jblocks *jblocks; struct jseg *jsegn; jblocks = jseg->js_jblocks; /* * Don't allow out of order completions. If this isn't the first * block wait for it to write before we're done. */ if (jseg != jblocks->jb_writeseg) return; /* Iterate through available jsegs processing their entries. */ while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) { jblocks->jb_oldestwrseq = jseg->js_oldseq; jsegn = TAILQ_NEXT(jseg, js_next); complete_jseg(jseg); jseg = jsegn; } jblocks->jb_writeseg = jseg; /* * Attempt to free jsegs now that oldestwrseq may have advanced. */ free_jsegs(jblocks); } /* * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle * the final completions. */ static void handle_written_jseg(jseg, bp) struct jseg *jseg; struct buf *bp; { if (jseg->js_refs == 0) panic("handle_written_jseg: No self-reference on %p", jseg); jseg->js_state |= DEPCOMPLETE; /* * We'll never need this buffer again, set flags so it will be * discarded. */ bp->b_flags |= B_INVAL | B_NOCACHE; pbrelvp(bp); complete_jsegs(jseg); } static inline struct jsegdep * inoref_jseg(inoref) struct inoref *inoref; { struct jsegdep *jsegdep; jsegdep = inoref->if_jsegdep; inoref->if_jsegdep = NULL; return (jsegdep); } /* * Called once a jremref has made it to stable store. The jremref is marked * complete and we attempt to free it. Any pagedeps writes sleeping waiting * for the jremref to complete will be awoken by free_jremref. */ static void handle_written_jremref(jremref) struct jremref *jremref; { struct inodedep *inodedep; struct jsegdep *jsegdep; struct dirrem *dirrem; /* Grab the jsegdep. */ jsegdep = inoref_jseg(&jremref->jr_ref); /* * Remove us from the inoref list. */ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("handle_written_jremref: Lost inodedep"); TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); /* * Complete the dirrem. */ dirrem = jremref->jr_dirrem; jremref->jr_dirrem = NULL; LIST_REMOVE(jremref, jr_deps); jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; jwork_insert(&dirrem->dm_jwork, jsegdep); if (LIST_EMPTY(&dirrem->dm_jremrefhd) && (dirrem->dm_state & COMPLETE) != 0) add_to_worklist(&dirrem->dm_list, 0); free_jremref(jremref); } /* * Called once a jaddref has made it to stable store. The dependency is * marked complete and any dependent structures are added to the inode * bufwait list to be completed as soon as it is written. If a bitmap write * depends on this entry we move the inode into the inodedephd of the * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. */ static void handle_written_jaddref(jaddref) struct jaddref *jaddref; { struct jsegdep *jsegdep; struct inodedep *inodedep; struct diradd *diradd; struct mkdir *mkdir; /* Grab the jsegdep. 
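	 * inoref_jseg() below detaches the jsegdep from the jaddref so that
	 * it can be handed to the dependent diradd's jwork list later on.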
*/ jsegdep = inoref_jseg(&jaddref->ja_ref); mkdir = NULL; diradd = NULL; if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("handle_written_jaddref: Lost inodedep."); if (jaddref->ja_diradd == NULL) panic("handle_written_jaddref: No dependency"); if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { diradd = jaddref->ja_diradd; WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); } else if (jaddref->ja_state & MKDIR_PARENT) { mkdir = jaddref->ja_mkdir; WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); } else if (jaddref->ja_state & MKDIR_BODY) mkdir = jaddref->ja_mkdir; else panic("handle_written_jaddref: Unknown dependency %p", jaddref->ja_diradd); jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ /* * Remove us from the inode list. */ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); /* * The mkdir may be waiting on the jaddref to clear before freeing. */ if (mkdir) { KASSERT(mkdir->md_list.wk_type == D_MKDIR, ("handle_written_jaddref: Incorrect type for mkdir %s", TYPENAME(mkdir->md_list.wk_type))); mkdir->md_jaddref = NULL; diradd = mkdir->md_diradd; mkdir->md_state |= DEPCOMPLETE; complete_mkdir(mkdir); } jwork_insert(&diradd->da_jwork, jsegdep); if (jaddref->ja_state & NEWBLOCK) { inodedep->id_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, inodedep, id_deps); } free_jaddref(jaddref); } /* * Called once a jnewblk journal is written. The allocdirect or allocindir * is placed in the bmsafemap to await notification of a written bitmap. If * the operation was canceled we add the segdep to the appropriate * dependency to free the journal space once the canceling operation * completes. */ static void handle_written_jnewblk(jnewblk) struct jnewblk *jnewblk; { struct bmsafemap *bmsafemap; struct freefrag *freefrag; struct freework *freework; struct jsegdep *jsegdep; struct newblk *newblk; /* Grab the jsegdep. */ jsegdep = jnewblk->jn_jsegdep; jnewblk->jn_jsegdep = NULL; if (jnewblk->jn_dep == NULL) panic("handle_written_jnewblk: No dependency for the segdep."); switch (jnewblk->jn_dep->wk_type) { case D_NEWBLK: case D_ALLOCDIRECT: case D_ALLOCINDIR: /* * Add the written block to the bmsafemap so it can * be notified when the bitmap is on disk. */ newblk = WK_NEWBLK(jnewblk->jn_dep); newblk->nb_jnewblk = NULL; if ((newblk->nb_state & GOINGAWAY) == 0) { bmsafemap = newblk->nb_bmsafemap; newblk->nb_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); } jwork_insert(&newblk->nb_jwork, jsegdep); break; case D_FREEFRAG: /* * A newblock being removed by a freefrag when replaced by * frag extension. */ freefrag = WK_FREEFRAG(jnewblk->jn_dep); freefrag->ff_jdep = NULL; jwork_insert(&freefrag->ff_jwork, jsegdep); break; case D_FREEWORK: /* * A direct block was removed by truncate. */ freework = WK_FREEWORK(jnewblk->jn_dep); freework->fw_jnewblk = NULL; jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep); break; default: panic("handle_written_jnewblk: Unknown type %d.", jnewblk->jn_dep->wk_type); } jnewblk->jn_dep = NULL; free_jnewblk(jnewblk); } /* * Cancel a jfreefrag that won't be needed, probably due to colliding with * an in-flight allocation that has not yet been committed. Divorce us * from the freefrag and mark it DEPCOMPLETE so that it may be added * to the worklist. 
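 * The jsegdep is released as well, since no journal write will ever
 * complete for this record.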
*/ static void cancel_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { struct freefrag *freefrag; if (jfreefrag->fr_jsegdep) { free_jsegdep(jfreefrag->fr_jsegdep); jfreefrag->fr_jsegdep = NULL; } freefrag = jfreefrag->fr_freefrag; jfreefrag->fr_freefrag = NULL; free_jfreefrag(jfreefrag); freefrag->ff_state |= DEPCOMPLETE; CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno); } /* * Free a jfreefrag when the parent freefrag is rendered obsolete. */ static void free_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { if (jfreefrag->fr_state & INPROGRESS) WORKLIST_REMOVE(&jfreefrag->fr_list); else if (jfreefrag->fr_state & ONWORKLIST) remove_from_journal(&jfreefrag->fr_list); if (jfreefrag->fr_freefrag != NULL) panic("free_jfreefrag: Still attached to a freefrag."); WORKITEM_FREE(jfreefrag, D_JFREEFRAG); } /* * Called when the journal write for a jfreefrag completes. The parent * freefrag is added to the worklist if this completes its dependencies. */ static void handle_written_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { struct jsegdep *jsegdep; struct freefrag *freefrag; /* Grab the jsegdep. */ jsegdep = jfreefrag->fr_jsegdep; jfreefrag->fr_jsegdep = NULL; freefrag = jfreefrag->fr_freefrag; if (freefrag == NULL) panic("handle_written_jfreefrag: No freefrag."); freefrag->ff_state |= DEPCOMPLETE; freefrag->ff_jdep = NULL; jwork_insert(&freefrag->ff_jwork, jsegdep); if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freefrag->ff_list, 0); jfreefrag->fr_freefrag = NULL; free_jfreefrag(jfreefrag); } /* * Called when the journal write for a jfreeblk completes. The jfreeblk * is removed from the freeblks list of pending journal writes and the * jsegdep is moved to the freeblks jwork to be completed when all blocks * have been reclaimed. */ static void handle_written_jblkdep(jblkdep) struct jblkdep *jblkdep; { struct freeblks *freeblks; struct jsegdep *jsegdep; /* Grab the jsegdep. */ jsegdep = jblkdep->jb_jsegdep; jblkdep->jb_jsegdep = NULL; freeblks = jblkdep->jb_freeblks; LIST_REMOVE(jblkdep, jb_deps); jwork_insert(&freeblks->fb_jwork, jsegdep); /* * If the freeblks is all journaled, we can add it to the worklist. */ if (LIST_EMPTY(&freeblks->fb_jblkdephd) && (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freeblks->fb_list, WK_NODELAY); free_jblkdep(jblkdep); } static struct jsegdep * newjsegdep(struct worklist *wk) { struct jsegdep *jsegdep; jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); jsegdep->jd_seg = NULL; return (jsegdep); } static struct jmvref * newjmvref(dp, ino, oldoff, newoff) struct inode *dp; ino_t ino; off_t oldoff; off_t newoff; { struct jmvref *jmvref; jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp)); jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; jmvref->jm_parent = dp->i_number; jmvref->jm_ino = ino; jmvref->jm_oldoff = oldoff; jmvref->jm_newoff = newoff; return (jmvref); } /* * Allocate a new jremref that tracks the removal of ip from dp with the * directory entry offset of diroff. Mark the entry as ATTACHED and * DEPCOMPLETE as we have all the information required for the journal write * and the directory has already been removed from the buffer. The caller * is responsible for linking the jremref into the pagedep and adding it * to the journal to write. 
 * The MKDIR_PARENT flag is set if we're doing
 * a DOTDOT addition so handle_workitem_remove() can properly assign
 * the jsegdep when we're done.
 */
static struct jremref *
newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
    off_t diroff, nlink_t nlink)
{
	struct jremref *jremref;

	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
	jremref->jr_state = ATTACHED;
	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
	    nlink, ip->i_mode);
	jremref->jr_dirrem = dirrem;
	return (jremref);
}

static inline void
newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
    nlink_t nlink, uint16_t mode)
{

	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
	inoref->if_diroff = diroff;
	inoref->if_ino = ino;
	inoref->if_parent = parent;
	inoref->if_nlink = nlink;
	inoref->if_mode = mode;
}

/*
 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
 * directory offset may not be known until later.  The caller is responsible
 * for adding the entry to the journal when this information is available.
 * nlink should be the link count prior to the addition and mode is only
 * required to have the correct FMT.
 */
static struct jaddref *
newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
    uint16_t mode)
{
	struct jaddref *jaddref;

	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
	workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
	jaddref->ja_state = ATTACHED;
	jaddref->ja_mkdir = NULL;
	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
	return (jaddref);
}

/*
 * Create a new free dependency for a freework.  The caller is responsible
 * for adjusting the reference count when it has the lock held.  The freedep
 * will track an outstanding bitmap write that will ultimately clear the
 * freework to continue.
 */
static struct freedep *
newfreedep(struct freework *freework)
{
	struct freedep *freedep;

	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
	freedep->fd_freework = freework;
	return (freedep);
}

/*
 * Free a freedep structure once the buffer it is linked to is written.  If
 * this is the last reference to the freework, schedule it for completion.
 */
static void
free_freedep(freedep)
	struct freedep *freedep;
{
	struct freework *freework;

	freework = freedep->fd_freework;
	freework->fw_freeblks->fb_cgwait--;
	if (--freework->fw_ref == 0)
		freework_enqueue(freework);
	WORKITEM_FREE(freedep, D_FREEDEP);
}

/*
 * Allocate a new freework structure that represents a level in an
 * indirect-block chain when parent is not NULL, or a top-level block when
 * parent is NULL.  The top level freework structures are allocated without
 * the per-filesystem lock held and before the freeblks is visible outside
 * of softdep_setup_freeblocks().
 */
static struct freework *
newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
	struct ufsmount *ump;
	struct freeblks *freeblks;
	struct freework *parent;
	ufs_lbn_t lbn;
	ufs2_daddr_t nb;
	int frags;
	int off;
	int journal;
{
	struct freework *freework;

	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
	freework->fw_state = ATTACHED;
	freework->fw_jnewblk = NULL;
	freework->fw_freeblks = freeblks;
	freework->fw_parent = parent;
	freework->fw_lbn = lbn;
	freework->fw_blkno = nb;
	freework->fw_frags = frags;
	freework->fw_indir = NULL;
	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 ||
	    lbn >= -UFS_NXADDR) ?
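	    /*
	     * Journaled indirect levels start with NINDIR + 1 references:
	     * one per child pointer plus a self reference.  The count is
	     * dropped as child work completes (see, e.g., free_freedep()
	     * above).
	     */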
0 : NINDIR(ump->um_fs) + 1; freework->fw_start = freework->fw_off = off; if (journal) newjfreeblk(freeblks, lbn, nb, frags); if (parent == NULL) { ACQUIRE_LOCK(ump); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); freeblks->fb_ref++; FREE_LOCK(ump); } return (freework); } /* * Eliminate a jfreeblk for a block that does not need journaling. */ static void cancel_jfreeblk(freeblks, blkno) struct freeblks *freeblks; ufs2_daddr_t blkno; { struct jfreeblk *jfreeblk; struct jblkdep *jblkdep; LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { if (jblkdep->jb_list.wk_type != D_JFREEBLK) continue; jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); if (jfreeblk->jf_blkno == blkno) break; } if (jblkdep == NULL) return; CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno); free_jsegdep(jblkdep->jb_jsegdep); LIST_REMOVE(jblkdep, jb_deps); WORKITEM_FREE(jfreeblk, D_JFREEBLK); } /* * Allocate a new jfreeblk to journal top level block pointer when truncating * a file. The caller must add this to the worklist when the per-filesystem * lock is held. */ static struct jfreeblk * newjfreeblk(freeblks, lbn, blkno, frags) struct freeblks *freeblks; ufs_lbn_t lbn; ufs2_daddr_t blkno; int frags; { struct jfreeblk *jfreeblk; jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, freeblks->fb_list.wk_mp); jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); jfreeblk->jf_dep.jb_freeblks = freeblks; jfreeblk->jf_ino = freeblks->fb_inum; jfreeblk->jf_lbn = lbn; jfreeblk->jf_blkno = blkno; jfreeblk->jf_frags = frags; LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); return (jfreeblk); } /* * The journal is only prepared to handle full-size block numbers, so we * have to adjust the record to reflect the change to a full-size block. * For example, suppose we have a block made up of fragments 8-15 and * want to free its last two fragments. We are given a request that says: * FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0 * where frags are the number of fragments to free and oldfrags are the * number of fragments to keep. To block align it, we have to change it to * have a valid full-size blkno, so it becomes: * FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6 */ static void adjust_newfreework(freeblks, frag_offset) struct freeblks *freeblks; int frag_offset; { struct jfreeblk *jfreeblk; KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL && LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK), ("adjust_newfreework: Missing freeblks dependency")); jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd)); jfreeblk->jf_blkno -= frag_offset; jfreeblk->jf_frags += frag_offset; } /* * Allocate a new jtrunc to track a partial truncation. */ static struct jtrunc * newjtrunc(freeblks, size, extsize) struct freeblks *freeblks; off_t size; int extsize; { struct jtrunc *jtrunc; jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, freeblks->fb_list.wk_mp); jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); jtrunc->jt_dep.jb_freeblks = freeblks; jtrunc->jt_ino = freeblks->fb_inum; jtrunc->jt_size = size; jtrunc->jt_extsize = extsize; LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); return (jtrunc); } /* * If we're canceling a new bitmap we have to search for another ref * to move into the bmsafemap dep. This might be better expressed * with another structure. 
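 * The loop below scans forward through the inode's inoref list for a
 * later jaddref that can take over the NEWBLOCK state.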
*/ static void move_newblock_dep(jaddref, inodedep) struct jaddref *jaddref; struct inodedep *inodedep; { struct inoref *inoref; struct jaddref *jaddrefn; jaddrefn = NULL; for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; inoref = TAILQ_NEXT(inoref, if_deps)) { if ((jaddref->ja_state & NEWBLOCK) && inoref->if_list.wk_type == D_JADDREF) { jaddrefn = (struct jaddref *)inoref; break; } } if (jaddrefn == NULL) return; jaddrefn->ja_state &= ~(ATTACHED | UNDONE); jaddrefn->ja_state |= jaddref->ja_state & (ATTACHED | UNDONE | NEWBLOCK); jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); jaddref->ja_state |= ATTACHED; LIST_REMOVE(jaddref, ja_bmdeps); LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, ja_bmdeps); } /* * Cancel a jaddref either before it has been written or while it is being * written. This happens when a link is removed before the add reaches * the disk. The jaddref dependency is kept linked into the bmsafemap * and inode to prevent the link count or bitmap from reaching the disk * until handle_workitem_remove() re-adjusts the counts and bitmaps as * required. * * Returns 1 if the canceled addref requires journaling of the remove and * 0 otherwise. */ static int cancel_jaddref(jaddref, inodedep, wkhd) struct jaddref *jaddref; struct inodedep *inodedep; struct workhead *wkhd; { struct inoref *inoref; struct jsegdep *jsegdep; int needsj; KASSERT((jaddref->ja_state & COMPLETE) == 0, ("cancel_jaddref: Canceling complete jaddref")); if (jaddref->ja_state & (INPROGRESS | COMPLETE)) needsj = 1; else needsj = 0; if (inodedep == NULL) if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_jaddref: Lost inodedep"); /* * We must adjust the nlink of any reference operation that follows * us so that it is consistent with the in-memory reference. This * ensures that inode nlink rollbacks always have the correct link. */ if (needsj == 0) { for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; inoref = TAILQ_NEXT(inoref, if_deps)) { if (inoref->if_state & GOINGAWAY) break; inoref->if_nlink--; } } jsegdep = inoref_jseg(&jaddref->ja_ref); if (jaddref->ja_state & NEWBLOCK) move_newblock_dep(jaddref, inodedep); wake_worklist(&jaddref->ja_list); jaddref->ja_mkdir = NULL; if (jaddref->ja_state & INPROGRESS) { jaddref->ja_state &= ~INPROGRESS; WORKLIST_REMOVE(&jaddref->ja_list); jwork_insert(wkhd, jsegdep); } else { free_jsegdep(jsegdep); if (jaddref->ja_state & DEPCOMPLETE) remove_from_journal(&jaddref->ja_list); } jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); /* * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove * can arrange for them to be freed with the bitmap. Otherwise we * no longer need this addref attached to the inoreflst and it * will incorrectly adjust nlink if we leave it. */ if ((jaddref->ja_state & NEWBLOCK) == 0) { TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); jaddref->ja_state |= COMPLETE; free_jaddref(jaddref); return (needsj); } /* * Leave the head of the list for jsegdeps for fast merging. */ if (LIST_FIRST(wkhd) != NULL) { jaddref->ja_state |= ONWORKLIST; LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); } else WORKLIST_INSERT(wkhd, &jaddref->ja_list); return (needsj); } /* * Attempt to free a jaddref structure when some work completes. This * should only succeed once the entry is written and all dependencies have * been notified. 
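 * Until then the ALLCOMPLETE check below makes this call a harmless
 * no-op.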
 */
static void
free_jaddref(jaddref)
	struct jaddref *jaddref;
{

	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	if (jaddref->ja_ref.if_jsegdep)
		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
		    jaddref, jaddref->ja_state);
	if (jaddref->ja_state & NEWBLOCK)
		LIST_REMOVE(jaddref, ja_bmdeps);
	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
		panic("free_jaddref: Bad state %p(0x%X)",
		    jaddref, jaddref->ja_state);
	if (jaddref->ja_mkdir != NULL)
		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
	WORKITEM_FREE(jaddref, D_JADDREF);
}

/*
 * Free a jremref structure once it has been written or discarded.
 */
static void
free_jremref(jremref)
	struct jremref *jremref;
{

	if (jremref->jr_ref.if_jsegdep)
		free_jsegdep(jremref->jr_ref.if_jsegdep);
	if (jremref->jr_state & INPROGRESS)
		panic("free_jremref: IO still pending");
	WORKITEM_FREE(jremref, D_JREMREF);
}

/*
 * Free a jnewblk structure.
 */
static void
free_jnewblk(jnewblk)
	struct jnewblk *jnewblk;
{

	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
		return;
	LIST_REMOVE(jnewblk, jn_deps);
	if (jnewblk->jn_dep != NULL)
		panic("free_jnewblk: Dependency still attached.");
	WORKITEM_FREE(jnewblk, D_JNEWBLK);
}

/*
 * Cancel a jnewblk which has been made redundant by frag extension.
 */
static void
cancel_jnewblk(jnewblk, wkhd)
	struct jnewblk *jnewblk;
	struct workhead *wkhd;
{
	struct jsegdep *jsegdep;

	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
	jsegdep = jnewblk->jn_jsegdep;
	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
		panic("cancel_jnewblk: Invalid state");
	jnewblk->jn_jsegdep = NULL;
	jnewblk->jn_dep = NULL;
	jnewblk->jn_state |= GOINGAWAY;
	if (jnewblk->jn_state & INPROGRESS) {
		jnewblk->jn_state &= ~INPROGRESS;
		WORKLIST_REMOVE(&jnewblk->jn_list);
		jwork_insert(wkhd, jsegdep);
	} else {
		free_jsegdep(jsegdep);
		remove_from_journal(&jnewblk->jn_list);
	}
	wake_worklist(&jnewblk->jn_list);
	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
}

static void
free_jblkdep(jblkdep)
	struct jblkdep *jblkdep;
{

	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
		WORKITEM_FREE(jblkdep, D_JFREEBLK);
	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
		WORKITEM_FREE(jblkdep, D_JTRUNC);
	else
		panic("free_jblkdep: Unexpected type %s",
		    TYPENAME(jblkdep->jb_list.wk_type));
}

/*
 * Free a single jseg once it is no longer referenced in memory or on
 * disk.  Reclaim journal blocks and dependencies waiting for the segment
 * to disappear.
 */
static void
free_jseg(jseg, jblocks)
	struct jseg *jseg;
	struct jblocks *jblocks;
{
	struct freework *freework;

	/*
	 * Free freework structures that were lingering to indicate freed
	 * indirect blocks that forced journal write ordering on reallocate.
	 */
	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
		indirblk_remove(freework);
	if (jblocks->jb_oldestseg == jseg)
		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
	KASSERT(LIST_EMPTY(&jseg->js_entries),
	    ("free_jseg: Freed jseg has valid entries."));
	WORKITEM_FREE(jseg, D_JSEG);
}

/*
 * Free all jsegs that meet the criteria for being reclaimed and update
 * oldestseg.
 */
static void
free_jsegs(jblocks)
	struct jblocks *jblocks;
{
	struct jseg *jseg;

	/*
	 * Free only those jsegs which have none allocated before them to
	 * preserve the journal space ordering.
	 */
	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
		/*
		 * Only reclaim space when nothing depends on this journal
		 * set and another set has written that it is no longer
		 * valid.
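		 * A set is no longer valid once a later segment recording a
		 * higher oldest-sequence (js_oldseq) has made it to disk;
		 * this is tracked in jb_oldestwrseq by complete_jsegs().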
*/ if (jseg->js_refs != 0) { jblocks->jb_oldestseg = jseg; return; } if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE) break; if (jseg->js_seq > jblocks->jb_oldestwrseq) break; /* * We can free jsegs that didn't write entries when * oldestwrseq == js_seq. */ if (jseg->js_seq == jblocks->jb_oldestwrseq && jseg->js_cnt != 0) break; free_jseg(jseg, jblocks); } /* * If we exited the loop above we still must discover the * oldest valid segment. */ if (jseg) for (jseg = jblocks->jb_oldestseg; jseg != NULL; jseg = TAILQ_NEXT(jseg, js_next)) if (jseg->js_refs != 0) break; jblocks->jb_oldestseg = jseg; /* * The journal has no valid records but some jsegs may still be * waiting on oldestwrseq to advance. We force a small record * out to permit these lingering records to be reclaimed. */ if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) jblocks->jb_needseg = 1; } /* * Release one reference to a jseg and free it if the count reaches 0. This * should eventually reclaim journal space as well. */ static void rele_jseg(jseg) struct jseg *jseg; { KASSERT(jseg->js_refs > 0, ("free_jseg: Invalid refcnt %d", jseg->js_refs)); if (--jseg->js_refs != 0) return; free_jsegs(jseg->js_jblocks); } /* * Release a jsegdep and decrement the jseg count. */ static void free_jsegdep(jsegdep) struct jsegdep *jsegdep; { if (jsegdep->jd_seg) rele_jseg(jsegdep->jd_seg); WORKITEM_FREE(jsegdep, D_JSEGDEP); } /* * Wait for a journal item to make it to disk. Initiate journal processing * if required. */ static int jwait(wk, waitfor) struct worklist *wk; int waitfor; { LOCK_OWNED(VFSTOUFS(wk->wk_mp)); /* * Blocking journal waits cause slow synchronous behavior. Record * stats on the frequency of these blocking operations. */ if (waitfor == MNT_WAIT) { stat_journal_wait++; switch (wk->wk_type) { case D_JREMREF: case D_JMVREF: stat_jwait_filepage++; break; case D_JTRUNC: case D_JFREEBLK: stat_jwait_freeblks++; break; case D_JNEWBLK: stat_jwait_newblk++; break; case D_JADDREF: stat_jwait_inode++; break; default: break; } } /* * If IO has not started we process the journal. We can't mark the * worklist item as IOWAITING because we drop the lock while * processing the journal and the worklist entry may be freed after * this point. The caller may call back in and re-issue the request. */ if ((wk->wk_state & INPROGRESS) == 0) { softdep_process_journal(wk->wk_mp, wk, waitfor); if (waitfor != MNT_WAIT) return (EBUSY); return (0); } if (waitfor != MNT_WAIT) return (EBUSY); wait_worklist(wk, "jwait"); return (0); } /* * Lookup an inodedep based on an inode pointer and set the nlinkdelta as * appropriate. This is a convenience function to reduce duplicate code * for the setup and revert functions below. */ static struct inodedep * inodedep_lookup_ip(ip) struct inode *ip; { struct inodedep *inodedep; KASSERT(ip->i_nlink >= ip->i_effnlink, ("inodedep_lookup_ip: bad delta")); (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC, &inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); return (inodedep); } /* * Called prior to creating a new inode and linking it to a directory. The * jaddref structure must already be allocated by softdep_setup_inomapdep * and it is discovered here so we can initialize the mode and update * nlinkdelta. 
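 * The jaddref lookup below applies only to journaled (SUJ) mounts;
 * plain softdep mounts just record the nlink delta.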
*/ void softdep_setup_create(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_create called on non-softdep filesystem")); KASSERT(ip->i_nlink == 1, ("softdep_setup_create: Invalid link count.")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_create: No addref structure present.")); } softdep_prelink(dvp, NULL); FREE_LOCK(ITOUMP(dp)); } /* * Create a jaddref structure to track the addition of a DOTDOT link when * we are reparenting an inode as part of a rename. This jaddref will be * found by softdep_setup_directory_change. Adjusts nlinkdelta for * non-journaling softdep. */ void softdep_setup_dotdot_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_dotdot_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; /* * We don't set MKDIR_PARENT as this is not tied to a mkdir and * is used as a normal link would be. */ if (DOINGSUJ(dvp)) jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(dp); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Create a jaddref structure to track a new link to an inode. The directory * offset is not known until softdep_setup_directory_add or * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling * softdep. */ void softdep_setup_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; if (DOINGSUJ(dvp)) jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, ip->i_mode); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to create the jaddref structures to track . and .. references as * well as lookup and further initialize the incomplete jaddref created * by softdep_setup_inomapdep when the inode was allocated. Adjusts * nlinkdelta for non-journaling softdep. 
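 *
 * The link-count arithmetic behind the two extra references is easy to
 * state: a new directory starts at two links (its entry in the parent
 * plus "."), and ".." adds one link back to the parent. A trivial
 * standalone check of the numbers the DOT_OFFSET/DOTDOT_OFFSET records
 * track (values invented for illustration):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>

int
main(void)
{
	int parent_nlink = 2;	/* "." plus its entry in the grandparent */
	int child_nlink;

	child_nlink = 2;	/* entry in the parent + the new "." */
	parent_nlink++;		/* the new ".." back-reference */

	assert(child_nlink == 2 && parent_nlink == 3);
	return (0);
}
#endif
/*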
*/ void softdep_setup_mkdir(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *dotdotaddref; struct jaddref *dotaddref; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); dotaddref = dotdotaddref = NULL; if (DOINGSUJ(dvp)) { dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, ip->i_mode); dotaddref->ja_state |= MKDIR_BODY; dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); dotdotaddref->ja_state |= MKDIR_PARENT; } ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL, ("softdep_setup_mkdir: No addref structure present.")); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_setup_mkdir: bad parent %ju", (uintmax_t)jaddref->ja_parent)); TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, if_deps); } inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &dotdotaddref->ja_ref, if_deps); softdep_prelink(ITOV(dp), NULL); FREE_LOCK(ITOUMP(dp)); } /* * Called to track nlinkdelta of the inode and parent directories prior to * unlinking a directory. */ void softdep_setup_rmdir(dp, ip) struct inode *dp; struct inode *ip; { struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_rmdir called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to track nlinkdelta of the inode and parent directories prior to * unlink. */ void softdep_setup_unlink(dp, ip) struct inode *dp; struct inode *ip; { struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_unlink called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed non-directory * creation. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_create(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0, ("softdep_revert_create called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_create: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed link * addition. Adjusts nlinkdelta for non-journaling softdep. 
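 *
 * Each revert routine below undoes its setup twin by canceling the most
 * recently queued reference record, which is why they all fetch
 * TAILQ_LAST of id_inoreflst. The shape of that pairing, sketched
 * standalone (ref_add/ref_revert are invented names, not this file's
 * API):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>
#include <stdlib.h>
#include <sys/queue.h>

struct ref {
	int			parent;
	TAILQ_ENTRY(ref)	entries;
};
TAILQ_HEAD(reflist, ref);

/* Setup: queue a reference record at the tail. */
static void
ref_add(struct reflist *lst, int parent)
{
	struct ref *r = calloc(1, sizeof(*r));

	r->parent = parent;
	TAILQ_INSERT_TAIL(lst, r, entries);
}

/* Revert: a failed operation cancels the newest record. */
static void
ref_revert(struct reflist *lst, int parent)
{
	struct ref *r = TAILQ_LAST(lst, reflist);

	assert(r != NULL && r->parent == parent);
	TAILQ_REMOVE(lst, r, entries);
	free(r);
}

int
main(void)
{
	struct reflist lst = TAILQ_HEAD_INITIALIZER(lst);

	ref_add(&lst, 42);
	ref_revert(&lst, 42);
	assert(TAILQ_EMPTY(&lst));
	return (0);
}
#endif
/*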
*/ void softdep_revert_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_link called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_link: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed mkdir * attempt. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_mkdir(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct jaddref *dotaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == ip->i_number, ("softdep_revert_mkdir: dotdot addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_mkdir: addref parent mismatch")); dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, inoreflst, if_deps); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); KASSERT(dotaddref->ja_parent == ip->i_number, ("softdep_revert_mkdir: dot addref parent mismatch")); cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to correct nlinkdelta after a failed rmdir. */ void softdep_revert_rmdir(dp, ip) struct inode *dp; struct inode *ip; { KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_rmdir called on non-softdep filesystem")); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); FREE_LOCK(ITOUMP(dp)); } /* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem * after a power failure, one must (conservatively) guarantee that the * on-disk copy of the bitmaps never indicates that a live inode or block is * free. So, when a block or inode is allocated, the bitmap should be * updated (on disk) before any new pointers. When a block or inode is * freed, the bitmap should not be updated until all pointers have been * reset. The latter dependency is handled by the delayed de-allocation * approach described below for block and inode de-allocation. The former * dependency is handled by calling the following procedure when a block or * inode is allocated. When an inode is allocated an "inodedep" is created * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. * Each "inodedep" is also inserted into the hash indexing structure so * that any additional link additions can be made dependent on the inode * allocation. * * The ufs filesystem maintains a number of free block counts (e.g., per * cylinder group, per cylinder and per <cylinder, rotational position> pair) * in addition to the bitmaps. These counts are used to improve efficiency * during allocation and therefore must be consistent with the bitmaps.
* There is no convenient way to guarantee post-crash consistency of these * counts with simple update ordering, for two main reasons: (1) The counts * and bitmaps for a single cylinder group block are not in the same disk * sector. If a disk write is interrupted (e.g., by power failure), one may * be written and the other not. (2) Some of the counts are located in the * superblock rather than the cylinder group block. So, we focus our soft * updates implementation on protecting the bitmaps. When mounting a * filesystem, we recompute the auxiliary counts from the bitmaps. */ /* * Called just after updating the cylinder group block to allocate an inode. */ void softdep_setup_inomapdep(bp, ip, newinum, mode) struct buf *bp; /* buffer for cylgroup block with inode map */ struct inode *ip; /* inode related to allocation */ ino_t newinum; /* new inode number being allocated */ int mode; { struct inodedep *inodedep; struct bmsafemap *bmsafemap; struct jaddref *jaddref; struct mount *mp; struct fs *fs; mp = ITOVFS(ip); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inomapdep called on non-softdep filesystem")); fs = VFSTOUFS(mp)->um_fs; jaddref = NULL; /* * Allocate the journal reference add structure so that the bitmap * can be dependent on it. */ if (MOUNTEDSUJ(mp)) { jaddref = newjaddref(ip, newinum, 0, 0, mode); jaddref->ja_state |= NEWBLOCK; } /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. * * We have to preallocate a bmsafemap entry in case it is needed * in bmsafemap_lookup since once we allocate the inodedep, we * have to finish initializing it before we can FREE_LOCK(). * By preallocating, we avoid FREE_LOCK() while doing a malloc * in bmsafemap_lookup. We cannot call bmsafemap_lookup before * creating the inodedep as it can be freed during the time * that we FREE_LOCK() while allocating the inodedep. We must * call workitem_alloc() before entering the locked section as * it also acquires the lock and we must avoid trying to do so * recursively. */ bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); ACQUIRE_LOCK(ITOUMP(ip)); if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep))) panic("softdep_setup_inomapdep: dependency %p for new " "inode already exists", inodedep); bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap); if (jaddref) { LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); } else { inodedep->id_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); } inodedep->id_bmsafemap = bmsafemap; inodedep->id_state &= ~DEPCOMPLETE; FREE_LOCK(ITOUMP(ip)); } /* * Called just after updating the cylinder group block to * allocate block or fragment. */ void softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; /* buffer for cylgroup block with block map */ struct mount *mp; /* filesystem doing allocation */ ufs2_daddr_t newblkno; /* number of newly allocated block */ int frags; /* Number of fragments. */ int oldfrags; /* Previous number of fragments for extend.
*/ { struct newblk *newblk; struct bmsafemap *bmsafemap; struct jnewblk *jnewblk; struct ufsmount *ump; struct fs *fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_blkmapdep called on non-softdep filesystem")); ump = VFSTOUFS(mp); fs = ump->um_fs; jnewblk = NULL; /* * Create a dependency for the newly allocated block. * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ if (MOUNTEDSUJ(mp)) { jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); jnewblk->jn_state = ATTACHED; jnewblk->jn_blkno = newblkno; jnewblk->jn_frags = frags; jnewblk->jn_oldfrags = oldfrags; #ifdef INVARIANTS { struct cg *cgp; uint8_t *blksfree; long bno; int i; cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); bno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isset(blksfree, bno + i)) panic("softdep_setup_blkmapdep: " "free fragment %d from %d-%d " "state 0x%X dep %p", i, jnewblk->jn_oldfrags, jnewblk->jn_frags, jnewblk->jn_state, jnewblk->jn_dep); } } #endif } CTR3(KTR_SUJ, "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d", newblkno, frags, oldfrags); ACQUIRE_LOCK(ump); if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, newblkno), NULL); if (jnewblk) { jnewblk->jn_dep = (struct worklist *)newblk; LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); } else { newblk->nb_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); } newblk->nb_bmsafemap = bmsafemap; newblk->nb_jnewblk = jnewblk; FREE_LOCK(ump); } #define BMSAFEMAP_HASH(ump, cg) \ (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size]) static int bmsafemap_find(bmsafemaphd, cg, bmsafemapp) struct bmsafemap_hashhead *bmsafemaphd; int cg; struct bmsafemap **bmsafemapp; { struct bmsafemap *bmsafemap; LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) if (bmsafemap->sm_cg == cg) break; if (bmsafemap) { *bmsafemapp = bmsafemap; return (1); } *bmsafemapp = NULL; return (0); } /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when * this routine is called and this routine must be called with * the softdep lock held. To avoid giving up the lock while * allocating a new bmsafemap, a preallocated bmsafemap may be * provided. If it is provided but not needed, it is freed. 
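 *
 * The preallocation contract described here is a classic way to make a
 * lookup-or-create safe under a lock that may not be dropped: the
 * caller allocates the spare while unlocked, and the lookup frees it
 * when an existing entry wins. A standalone sketch (struct node and
 * lookup_or_insert are invented names, not this file's API):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>
#include <stdlib.h>
#include <sys/queue.h>

struct node {
	int			key;
	LIST_ENTRY(node)	hash;
};
LIST_HEAD(bucket, node);

/*
 * Runs entirely under the caller's lock: no allocation happens here,
 * so the lock never has to be released mid-lookup.
 */
static struct node *
lookup_or_insert(struct bucket *b, int key, struct node *spare)
{
	struct node *n;

	LIST_FOREACH(n, b, hash)
		if (n->key == key)
			break;
	if (n != NULL) {
		free(spare);		/* provided but not needed */
		return (n);
	}
	spare->key = key;
	LIST_INSERT_HEAD(b, spare, hash);
	return (spare);
}

int
main(void)
{
	struct bucket b = LIST_HEAD_INITIALIZER(b);
	struct node *n1, *n2;

	n1 = lookup_or_insert(&b, 7, calloc(1, sizeof(struct node)));
	n2 = lookup_or_insert(&b, 7, calloc(1, sizeof(struct node)));
	assert(n1 == n2);	/* the second spare was freed */
	return (0);
}
#endif
/*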
*/ static struct bmsafemap * bmsafemap_lookup(mp, bp, cg, newbmsafemap) struct mount *mp; struct buf *bp; int cg; struct bmsafemap *newbmsafemap; { struct bmsafemap_hashhead *bmsafemaphd; struct bmsafemap *bmsafemap, *collision; struct worklist *wk; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer")); LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type == D_BMSAFEMAP) { if (newbmsafemap) WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); return (WK_BMSAFEMAP(wk)); } } bmsafemaphd = BMSAFEMAP_HASH(ump, cg); if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) { if (newbmsafemap) WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); return (bmsafemap); } if (newbmsafemap) { bmsafemap = newbmsafemap; } else { FREE_LOCK(ump); bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); ACQUIRE_LOCK(ump); } bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_inodedephd); LIST_INIT(&bmsafemap->sm_inodedepwr); LIST_INIT(&bmsafemap->sm_newblkhd); LIST_INIT(&bmsafemap->sm_newblkwr); LIST_INIT(&bmsafemap->sm_jaddrefhd); LIST_INIT(&bmsafemap->sm_jnewblkhd); LIST_INIT(&bmsafemap->sm_freehd); LIST_INIT(&bmsafemap->sm_freewr); if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) { WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); return (collision); } bmsafemap->sm_cg = cg; LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } /* * Direct block allocation dependencies. * * When a new block is allocated, the corresponding disk locations must be * initialized (with zeros or new data) before the on-disk inode points to * them. Also, the freemap from which the block was allocated must be * updated (on disk) before the inode's pointer. These two dependencies are * independent of each other and are needed for all file blocks and indirect * blocks that are pointed to directly by the inode. Just before the * "in-core" version of the inode is updated with a newly allocated block * number, a procedure (below) is called to setup allocation dependency * structures. These structures are removed when the corresponding * dependencies are satisfied or when the block allocation becomes obsolete * (i.e., the file is deleted, the block is de-allocated, or the block is a * fragment that gets upgraded). All of these cases are handled in * procedures described later. * * When a file extension causes a fragment to be upgraded, either to a larger * fragment or to a full block, the on-disk location may change (if the * previous fragment could not simply be extended). In this case, the old * fragment must be de-allocated, but not until after the inode's pointer has * been updated. In most cases, this is handled by later procedures, which * will construct a "freefrag" structure to be added to the workitem queue * when the inode update is complete (or obsolete). The main exception to * this is when an allocation occurs while a pending allocation dependency * (for the same block pointer) remains. This case is handled in the main * allocation dependency setup procedure by immediately freeing the * unreferenced fragments. 
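 *
 * The invariant those two orderings protect can be stated in one line:
 * the disk must never hold an inode pointer to a block that is still
 * marked free in the bitmap (or uninitialized). A toy checker for that
 * rule (struct disk and check() are invented; this states the policy,
 * not the kernel mechanism):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>
#include <stdbool.h>

struct disk {
	bool	bit_allocated;	/* bitmap state on disk */
	bool	pointer_set;	/* inode/indirect pointer on disk */
};

static void
check(const struct disk *d)
{
	assert(!d->pointer_set || d->bit_allocated);
}

int
main(void)
{
	struct disk d = { false, false };

	/* Allocate: the bitmap reaches disk before the new pointer. */
	d.bit_allocated = true;		check(&d);
	d.pointer_set = true;		check(&d);

	/* Free: the pointer is reset before the bitmap is updated. */
	d.pointer_set = false;		check(&d);
	d.bit_allocated = false;	check(&d);
	return (0);
}
#endif
/*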
*/ void softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ ufs_lbn_t off; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ long oldsize; /* size of new block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct freefrag *freefrag; struct inodedep *inodedep; struct pagedep *pagedep; struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; ufs_lbn_t lbn; lbn = bp->b_lblkno; mp = ITOVFS(ip); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocdirect called on non-softdep filesystem")); if (oldblkno && oldblkno != newblkno) /* * The usual case is that a smaller fragment that * was just allocated has been replaced with a bigger * fragment or a full-size block. If it is marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the block was written * earlier, but very uncommon. If the block has never * been written, there is no need to send a BIO_DELETE * for it when it is freed. The gain from avoiding the * TRIMs for the common case of unwritten blocks far * exceeds the cost of the write amplification for the * uncommon case of failing to send a TRIM for a block * that had been written. */ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); else freefrag = NULL; CTR6(KTR_SUJ, "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd " "off %jd newsize %ld oldsize %d", ip->i_number, newblkno, oldblkno, off, newsize, oldsize); ACQUIRE_LOCK(ITOUMP(ip)); if (off >= UFS_NDADDR) { if (lbn > 0) panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", lbn, off); /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { if (off != lbn) panic("softdep_setup_allocdirect: lbn %jd != off %jd", lbn, off); /* * Allocating a direct block. * * If we are allocating a directory block, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR) pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC, &pagedep); } if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("softdep_setup_allocdirect: newblk already initialized")); /* * Convert the newblk to an allocdirect. */ WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); adp = (struct allocdirect *)newblk; newblk->nb_freefrag = freefrag; adp->ad_offset = off; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; /* * Finish initializing the journal. 
*/ if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. */ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ITOUMP(ip)); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ITOUMP(ip)); } /* * Merge a newer and older journal record to be stored either in a * newblock or freefrag. This handles aggregating journal records for * fragment allocation into a second record as well as replacing a * journal free with an aborted journal allocation. A segment for the * oldest record will be placed on wkhd if it has been written. If not * the segment for the newer record will suffice. */ static struct worklist * jnewblk_merge(new, old, wkhd) struct worklist *new; struct worklist *old; struct workhead *wkhd; { struct jnewblk *njnewblk; struct jnewblk *jnewblk; /* Handle NULLs to simplify callers. */ if (new == NULL) return (old); if (old == NULL) return (new); /* Replace a jfreefrag with a jnewblk. */ if (new->wk_type == D_JFREEFRAG) { if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno) panic("jnewblk_merge: blkno mismatch: %p, %p", old, new); cancel_jfreefrag(WK_JFREEFRAG(new)); return (old); } if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK) panic("jnewblk_merge: Bad type: old %d new %d\n", old->wk_type, new->wk_type); /* * Handle merging of two jnewblk records that describe * different sets of fragments in the same block. */ jnewblk = WK_JNEWBLK(old); njnewblk = WK_JNEWBLK(new); if (jnewblk->jn_blkno != njnewblk->jn_blkno) panic("jnewblk_merge: Merging disparate blocks."); /* * The record may be rolled back in the cg. */ if (jnewblk->jn_state & UNDONE) { jnewblk->jn_state &= ~UNDONE; njnewblk->jn_state |= UNDONE; njnewblk->jn_state &= ~ATTACHED; } /* * We modify the newer addref and free the older so that if neither * has been written the most up-to-date copy will be on disk. If * both have been written but rolled back we only temporarily need * one of them to fix the bits when the cg write completes. 
*/ jnewblk->jn_state |= ATTACHED | COMPLETE; njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; cancel_jnewblk(jnewblk, wkhd); WORKLIST_REMOVE(&jnewblk->jn_list); free_jnewblk(jnewblk); return (new); } /* * Replace an old allocdirect dependency with a newer one. */ static void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { struct worklist *wk; struct freefrag *freefrag; freefrag = NULL; LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp)); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_offset >= UFS_NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, (intmax_t)oldadp->ad_newblkno, newadp->ad_oldsize, oldadp->ad_newsize); newadp->ad_oldblkno = oldadp->ad_oldblkno; newadp->ad_oldsize = oldadp->ad_oldsize; /* * If the old dependency had a fragment to free or had never * previously had a block allocated, then the new dependency * can immediately post its freefrag and adopt the old freefrag. * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on * the worklist when it is freed by free_newblk. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. * Here, the first part of the fragment allocated to the new * dependency is part of the block currently claimed on disk by * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ freefrag = newadp->ad_freefrag; if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } /* * If we are tracking a new directory-block allocation, * move it from the old allocdirect to the new allocdirect. */ if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { WORKLIST_REMOVE(wk); if (!LIST_EMPTY(&oldadp->ad_newdirblk)) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, wk); } TAILQ_REMOVE(adphead, oldadp, ad_next); /* * We need to move any journal dependencies over to the freefrag * that releases this block if it exists. Otherwise we are * extending an existing block and we'll wait until that is * complete to release the journal space and extend the * new journal to cover this old space as well. */ if (freefrag == NULL) { if (oldadp->ad_newblkno != newadp->ad_newblkno) panic("allocdirect_merge: %jd != %jd", oldadp->ad_newblkno, newadp->ad_newblkno); newadp->ad_block.nb_jnewblk = (struct jnewblk *) jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, &oldadp->ad_block.nb_jnewblk->jn_list, &newadp->ad_block.nb_jwork); oldadp->ad_block.nb_jnewblk = NULL; cancel_newblk(&oldadp->ad_block, NULL, &newadp->ad_block.nb_jwork); } else { wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, &freefrag->ff_list, &freefrag->ff_jwork); freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, &freefrag->ff_jwork); } free_newblk(&oldadp->ad_block); } /* * Allocate a jfreefrag structure to journal a single block free. 
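 *
 * jnewblk_merge() and allocdirect_merge() above share one shape: the
 * newer record absorbs whatever the older record still covered (its
 * oldfrags, its rollback state) and the older record is canceled. A
 * compact standalone sketch of that absorb-and-cancel step (struct rec
 * and rec_merge are invented names):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>

struct rec {
	int	oldfrags;	/* first fragment covered */
	int	frags;		/* end of covered range */
	int	undone;		/* rolled back in the cg buffer? */
	int	canceled;
};

/* The newer record takes over the older record's coverage and state. */
static void
rec_merge(struct rec *new, struct rec *old)
{
	assert(old->oldfrags <= new->oldfrags);
	new->oldfrags = old->oldfrags;	/* absorb the earlier coverage */
	if (old->undone) {		/* carry the rollback state */
		old->undone = 0;
		new->undone = 1;
	}
	old->canceled = 1;
}

int
main(void)
{
	struct rec old = { 0, 4, 1, 0 };	/* first alloc, rolled back */
	struct rec new = { 4, 8, 0, 0 };	/* frag extension */

	rec_merge(&new, &old);
	assert(new.oldfrags == 0 && new.undone && old.canceled);
	return (0);
}
#endif
/*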
*/ static struct jfreefrag * newjfreefrag(freefrag, ip, blkno, size, lbn) struct freefrag *freefrag; struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; { struct jfreefrag *jfreefrag; struct fs *fs; fs = ITOFS(ip); jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip)); jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; jfreefrag->fr_ino = ip->i_number; jfreefrag->fr_lbn = lbn; jfreefrag->fr_blkno = blkno; jfreefrag->fr_frags = numfrags(fs, size); jfreefrag->fr_freefrag = freefrag; return (jfreefrag); } /* * Allocate a new freefrag structure. */ static struct freefrag * newfreefrag(ip, blkno, size, lbn, key) struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; u_long key; { struct freefrag *freefrag; struct ufsmount *ump; struct fs *fs; CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd", ip->i_number, blkno, size, lbn); ump = ITOUMP(ip); fs = ump->um_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); freefrag = malloc(sizeof(struct freefrag), M_FREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump)); freefrag->ff_state = ATTACHED; LIST_INIT(&freefrag->ff_jwork); freefrag->ff_inum = ip->i_number; freefrag->ff_vtype = ITOV(ip)->v_type; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; freefrag->ff_key = key; if (MOUNTEDSUJ(UFSTOVFS(ump))) { freefrag->ff_jdep = (struct worklist *) newjfreefrag(freefrag, ip, blkno, size, lbn); } else { freefrag->ff_state |= DEPCOMPLETE; freefrag->ff_jdep = NULL; } return (freefrag); } /* * This workitem de-allocates fragments that were replaced during * file block allocation. */ static void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); struct workhead wkhd; CTR3(KTR_SUJ, "handle_workitem_freefrag: ino %d blkno %jd size %ld", freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize); /* * It would be illegal to add new completion items to the * freefrag after it was scheduled to be done, so it must be * safe to modify the list head here. */ LIST_INIT(&wkhd); ACQUIRE_LOCK(ump); LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); /* * If the journal has not been written we must cancel it here. */ if (freefrag->ff_jdep) { if (freefrag->ff_jdep->wk_type != D_JNEWBLK) panic("handle_workitem_freefrag: Unexpected type %d\n", freefrag->ff_jdep->wk_type); cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); } FREE_LOCK(ump); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd, freefrag->ff_key); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(ump); } /* * Set up a dependency structure for an external attributes data block. * This routine follows much of the structure of softdep_setup_allocdirect. * See the description of softdep_setup_allocdirect above for details.
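 *
 * newfreefrag() above sanity-checks that a freed range stays inside one
 * block: the fragment index of blkno plus the fragment count of size
 * must not exceed fs_frag. That arithmetic, standalone, for a
 * hypothetical 32K/4K block/fragment layout (macros simplified from
 * their fs.h counterparts):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>

#define FSIZE	4096			/* fragment size */
#define BSIZE	32768			/* block size */
#define FS_FRAG	(BSIZE / FSIZE)		/* 8 fragments per block */

#define numfrags(size)	((size) / FSIZE)
#define fragnum(blkno)	((blkno) % FS_FRAG)

int
main(void)
{
	long blkno = 21;	/* fragment-addressed block number */
	long size = 12288;	/* three fragments */

	/* The "frag size" panic above fires when this check fails. */
	assert(fragnum(blkno) + numfrags(size) <= FS_FRAG);
	return (0);
}
#endif
/*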
*/ void softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t off; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct freefrag *freefrag; struct inodedep *inodedep; struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; struct ufsmount *ump; ufs_lbn_t lbn; mp = ITOVFS(ip); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocext called on non-softdep filesystem")); KASSERT(off < UFS_NXADDR, ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off)); lbn = bp->b_lblkno; if (oldblkno && oldblkno != newblkno) /* * The usual case is that a smaller fragment that * was just allocated has been replaced with a bigger * fragment or a full-size block. If it is marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the block was written * earlier, but very uncommon. If the block has never * been written, there is no need to send a BIO_DELETE * for it when it is freed. The gain from avoiding the * TRIMs for the common case of unwritten blocks far * exceeds the cost of the write amplification for the * uncommon case of failing to send a TRIM for a block * that had been written. */ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); else freefrag = NULL; ACQUIRE_LOCK(ump); if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("softdep_setup_allocext: newblk already initialized")); /* * Convert the newblk to an allocdirect. */ WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); adp = (struct allocdirect *)newblk; newblk->nb_freefrag = freefrag; adp->ad_offset = off; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state |= EXTDATA; /* * Finish initializing the journal. */ if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. 
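 *
 * Standalone, that tail-first sorted insertion looks like this (struct
 * adp and adp_insert are invented names for the sketch; the real code
 * operates on allocdirects):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>
#include <stdlib.h>
#include <sys/queue.h>

struct adp {
	int			offset;
	TAILQ_ENTRY(adp)	next;
};
TAILQ_HEAD(adplist, adp);

/* Sorted insert: try the common append-at-tail case before scanning. */
static void
adp_insert(struct adplist *head, struct adp *new)
{
	struct adp *old = TAILQ_LAST(head, adplist);

	if (old == NULL || old->offset <= new->offset) {
		TAILQ_INSERT_TAIL(head, new, next);	/* fast path */
		return;
	}
	TAILQ_FOREACH(old, head, next)
		if (old->offset >= new->offset)
			break;
	TAILQ_INSERT_BEFORE(old, new, next);
}

int
main(void)
{
	struct adplist head = TAILQ_HEAD_INITIALIZER(head);
	int offs[] = { 0, 1, 3, 2 }, want[] = { 0, 1, 2, 3 }, i = 0;
	struct adp *a;

	for (size_t j = 0; j < 4; j++) {
		a = calloc(1, sizeof(*a));
		a->offset = offs[j];
		adp_insert(&head, a);
	}
	TAILQ_FOREACH(a, &head, next)
		assert(a->offset == want[i++]);
	return (0);
}
#endif
/*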
*/ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ump); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ump); } /* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when * a new block is allocated and pointed to by an entry in a block of * indirect pointers. The undo/redo states described above are also * used here. Because an indirect block contains many pointers that * may have dependencies, a second copy of the entire in-memory indirect * block is kept. The buffer cache copy is always completely up-to-date. * The second copy, which is used only as a source for disk writes, * contains only the safe pointers (i.e., those that have no remaining * update dependencies). The second copy is freed when all pointers * are safe. The cache is not allowed to replace indirect blocks with * pending update dependencies. If a buffer containing an indirect * block with dependencies is written, these routines will mark it * dirty again. It can only be successfully written once all the * dependencies are removed. The ffs_fsync routine in conjunction with * softdep_sync_metadata work together to get all the dependencies * removed so that a file can be successfully written to disk. Three * procedures are used when setting up indirect block pointer * dependencies. The division is necessary because of the organization * of the "balloc" routine and because of the distinction between file * pages and file metadata blocks. */ /* * Allocate a new allocindir structure. */ static struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ ufs_lbn_t lbn; { struct newblk *newblk; struct allocindir *aip; struct freefrag *freefrag; struct jnewblk *jnewblk; if (oldblkno) freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn, SINGLETON_KEY); else freefrag = NULL; ACQUIRE_LOCK(ITOUMP(ip)); if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0) panic("new_allocindir: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("newallocindir: newblk already initialized")); WORKITEM_REASSIGN(newblk, D_ALLOCINDIR); newblk->nb_freefrag = freefrag; aip = (struct allocindir *)newblk; aip->ai_offset = ptrno; aip->ai_oldblkno = oldblkno; aip->ai_lbn = lbn; if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); return (aip); } /* * Called just before setting an indirect block pointer * to a newly allocated file page. 
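 *
 * The "second copy" scheme described above can be reduced to a pair of
 * arrays: a live copy that is always current, and a safe copy that only
 * receives a pointer once its dependencies are complete; disk writes go
 * from the safe copy. Sketched standalone (struct indir and indir_set
 * are invented names):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>
#include <string.h>

#define NPTR	4

struct indir {
	long	live[NPTR];	/* buffer cache copy: fully up to date */
	long	safe[NPTR];	/* what may be written to disk */
};

static void
indir_set(struct indir *ib, int i, long blkno, int depcomplete)
{
	ib->live[i] = blkno;
	if (depcomplete)
		ib->safe[i] = blkno;	/* pointer has no dependencies */
}

int
main(void)
{
	struct indir ib;

	memset(&ib, 0, sizeof(ib));
	indir_set(&ib, 0, 100, 1);
	indir_set(&ib, 1, 200, 0);	/* dependency still pending */

	/* A write sees slot 1 rolled back to zero until it is safe. */
	assert(ib.safe[0] == 100 && ib.safe[1] == 0);
	return (0);
}
#endif
/*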
*/ void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { struct inodedep *inodedep; struct freefrag *freefrag; struct allocindir *aip; struct pagedep *pagedep; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(ip); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocindir_page called on non-softdep filesystem")); KASSERT(lbn == nbp->b_lblkno, ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", lbn, bp->b_lblkno)); CTR4(KTR_SUJ, "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd " "lbn %jd", ip->i_number, newblkno, oldblkno, lbn); ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR) pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(ump); if (freefrag) handle_workitem_freefrag(freefrag); } /* * Called just before setting an indirect block pointer to a * newly allocated indirect block. */ void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { struct inodedep *inodedep; struct allocindir *aip; struct ufsmount *ump; ufs_lbn_t lbn; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_allocindir_meta called on non-softdep filesystem")); CTR3(KTR_SUJ, "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d", ip->i_number, newblkno, ptrno); lbn = nbp->b_lblkno; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); aip = newallocindir(ip, ptrno, newblkno, 0, lbn); inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) panic("softdep_setup_allocindir_meta: Block already existed"); FREE_LOCK(ump); } static void indirdep_complete(indirdep) struct indirdep *indirdep; { struct allocindir *aip; LIST_REMOVE(indirdep, ir_next); indirdep->ir_state |= DEPCOMPLETE; while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { LIST_REMOVE(aip, ai_next); free_newblk(&aip->ai_block); } /* * If this indirdep is not attached to a buf it was simply waiting * on completion to clear completehd. free_indirdep() asserts * that nothing is dangling. 
*/ if ((indirdep->ir_state & ONWORKLIST) == 0) free_indirdep(indirdep); } static struct indirdep * indirdep_lookup(mp, ip, bp) struct mount *mp; struct inode *ip; struct buf *bp; { struct indirdep *indirdep, *newindirdep; struct newblk *newblk; struct ufsmount *ump; struct worklist *wk; struct fs *fs; ufs2_daddr_t blkno; ump = VFSTOUFS(mp); LOCK_OWNED(ump); indirdep = NULL; newindirdep = NULL; fs = ump->um_fs; for (;;) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); break; } /* Found on the buffer worklist, no new structure to free. */ if (indirdep != NULL && newindirdep == NULL) return (indirdep); if (indirdep != NULL && newindirdep != NULL) panic("indirdep_lookup: simultaneous create"); /* None found on the buffer and a new structure is ready. */ if (indirdep == NULL && newindirdep != NULL) break; /* None found and no new structure available. */ FREE_LOCK(ump); newindirdep = malloc(sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); newindirdep->ir_state = ATTACHED; if (I_IS_UFS1(ip)) newindirdep->ir_state |= UFS1FMT; TAILQ_INIT(&newindirdep->ir_trunc); newindirdep->ir_saveddata = NULL; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); LIST_INIT(&newindirdep->ir_writehd); LIST_INIT(&newindirdep->ir_completehd); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; } newindirdep->ir_freeblks = NULL; newindirdep->ir_savebp = getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); newindirdep->ir_bp = bp; BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); ACQUIRE_LOCK(ump); } indirdep = newindirdep; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); /* * If the block is not yet allocated we don't set DEPCOMPLETE so * that we don't free dependencies until the pointers are valid. * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather * than using the hash. */ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); else indirdep->ir_state |= DEPCOMPLETE; return (indirdep); } /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static struct freefrag * setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ struct inodedep *inodedep; /* Inodedep for ip */ struct allocindir *aip; /* allocindir allocated by the above routines */ ufs_lbn_t lbn; /* Logical block number for this block. */ { struct fs *fs; struct indirdep *indirdep; struct allocindir *oldaip; struct freefrag *freefrag; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(ip); ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); indirdep = indirdep_lookup(mp, ip, bp); KASSERT(indirdep->ir_savebp != NULL, ("setup_allocindir_phase2 NULL ir_savebp")); aip->ai_indirdep = indirdep; /* * Check for an unwritten dependency for this indirect offset. If * there is, merge the old dependency into the new one. This happens * as a result of reallocblk only. 
*/ freefrag = NULL; if (aip->ai_oldblkno != 0) { LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { if (oldaip->ai_offset == aip->ai_offset) { freefrag = allocindir_merge(aip, oldaip); goto done; } } LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { if (oldaip->ai_offset == aip->ai_offset) { freefrag = allocindir_merge(aip, oldaip); goto done; } } } done: LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); return (freefrag); } /* * Merge two allocindirs which refer to the same block. Move newblock * dependencies and setup the freefrags appropriately. */ static struct freefrag * allocindir_merge(aip, oldaip) struct allocindir *aip; struct allocindir *oldaip; { struct freefrag *freefrag; struct worklist *wk; if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("allocindir_merge: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; freefrag = aip->ai_freefrag; aip->ai_freefrag = oldaip->ai_freefrag; oldaip->ai_freefrag = NULL; KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); /* * If we are tracking a new directory-block allocation, * move it from the old allocindir to the new allocindir. */ if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { WORKLIST_REMOVE(wk); if (!LIST_EMPTY(&oldaip->ai_newdirblk)) panic("allocindir_merge: extra newdirblk"); WORKLIST_INSERT(&aip->ai_newdirblk, wk); } /* * We can skip journaling for this freefrag and just complete * any pending journal work for the allocindir that is being * removed after the freefrag completes. */ if (freefrag->ff_jdep) cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); LIST_REMOVE(oldaip, ai_next); freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, &freefrag->ff_list, &freefrag->ff_jwork); free_newblk(&oldaip->ai_block); return (freefrag); } static inline void setup_freedirect(freeblks, ip, i, needj) struct freeblks *freeblks; struct inode *ip; int i; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; int frags; blkno = DIP(ip, i_db[i]); if (blkno == 0) return; DIP_SET(ip, i_db[i], 0); ump = ITOUMP(ip); frags = sblksize(ump->um_fs, ip->i_size, i); frags = numfrags(ump->um_fs, frags); newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj); } static inline void setup_freeext(freeblks, ip, i, needj) struct freeblks *freeblks; struct inode *ip; int i; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; int frags; blkno = ip->i_din2->di_extb[i]; if (blkno == 0) return; ip->i_din2->di_extb[i] = 0; ump = ITOUMP(ip); frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i); frags = numfrags(ump->um_fs, frags); newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); } static inline void setup_freeindir(freeblks, ip, i, lbn, needj) struct freeblks *freeblks; struct inode *ip; int i; ufs_lbn_t lbn; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; blkno = DIP(ip, i_ib[i]); if (blkno == 0) return; DIP_SET(ip, i_ib[i], 0); ump = ITOUMP(ip); newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag, 0, needj); } static inline struct freeblks * newfreeblks(mp, ip) struct mount *mp; struct inode *ip; { struct freeblks *freeblks; freeblks = malloc(sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); LIST_INIT(&freeblks->fb_jblkdephd); LIST_INIT(&freeblks->fb_jwork); freeblks->fb_ref = 0; freeblks->fb_cgwait = 0; freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_inum = ip->i_number; freeblks->fb_vtype = ITOV(ip)->v_type; freeblks->fb_modrev = DIP(ip, i_modrev); freeblks->fb_devvp = 
ITODEVVP(ip); freeblks->fb_chkcnt = 0; freeblks->fb_len = 0; return (freeblks); } static void trunc_indirdep(indirdep, freeblks, bp, off) struct indirdep *indirdep; struct freeblks *freeblks; struct buf *bp; int off; { struct allocindir *aip, *aipn; /* * The first set of allocindirs won't be in savedbp. */ LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, bp, freeblks, 1); LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, bp, freeblks, 1); /* * These will exist in savedbp. */ LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, NULL, freeblks, 0); LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, NULL, freeblks, 0); } /* * Follow the chain of indirects down to lastlbn creating a freework * structure for each. This will be used to start indir_trunc() at * the right offset and create the journal records for the partial * truncation. A second step will handle the truncated dependencies. */ static int setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno) struct freeblks *freeblks; struct inode *ip; ufs_lbn_t lbn; ufs_lbn_t lastlbn; ufs2_daddr_t blkno; { struct indirdep *indirdep; struct indirdep *indirn; struct freework *freework; struct newblk *newblk; struct mount *mp; struct ufsmount *ump; struct buf *bp; uint8_t *start; uint8_t *end; ufs_lbn_t lbnadd; int level; int error; int off; freework = NULL; if (blkno == 0) return (0); mp = freeblks->fb_list.wk_mp; ump = VFSTOUFS(mp); /* * Here, calls to VOP_BMAP() will fail. However, we already have * the on-disk address, so we just pass it to bread() instead of * having bread() attempt to calculate it using VOP_BMAP(). */ error = ffs_breadz(ump, ITOV(ip), lbn, blkptrtodb(ump, blkno), (int)mp->mnt_stat.f_iosize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error) return (error); level = lbn_level(lbn); lbnadd = lbn_offset(ump->um_fs, level); /* * Compute the offset of the last block we want to keep. Store * in the freework the first block we want to completely free. */ off = (lastlbn - -(lbn + level)) / lbnadd; if (off + 1 == NINDIR(ump->um_fs)) goto nowork; freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0); /* * Link the freework into the indirdep. This will prevent any new * allocations from proceeding until we are finished with the * truncate and the block is written. */ ACQUIRE_LOCK(ump); indirdep = indirdep_lookup(mp, ip, bp); if (indirdep->ir_freeblks) panic("setup_trunc_indir: indirdep already truncated."); TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); freework->fw_indir = indirdep; /* * Cancel any allocindirs that will not make it to disk. * We have to do this for all copies of the indirdep that * live on this newblk. */ if ((indirdep->ir_state & DEPCOMPLETE) == 0) { if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, &newblk) == 0) panic("setup_trunc_indir: lost block"); LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) trunc_indirdep(indirn, freeblks, bp, off); } else trunc_indirdep(indirdep, freeblks, bp, off); FREE_LOCK(ump); /* * Creation is protected by the buf lock. The saveddata is only * needed if a full truncation follows a partial truncation but it * is difficult to allocate in that case so we fetch it anyway.
*/ if (indirdep->ir_saveddata == NULL) indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); nowork: /* Fetch the blkno of the child and the zero start offset. */ if (I_IS_UFS1(ip)) { blkno = ((ufs1_daddr_t *)bp->b_data)[off]; start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; } else { blkno = ((ufs2_daddr_t *)bp->b_data)[off]; start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; } if (freework) { /* Zero the truncated pointers. */ end = bp->b_data + bp->b_bcount; bzero(start, end - start); bdwrite(bp); } else bqrelse(bp); if (level == 0) return (0); lbn++; /* adjust level */ lbn -= (off * lbnadd); return (setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)); } /* * Complete the partial truncation of an indirect block setup by * setup_trunc_indir(). This zeros the truncated pointers in the saved * copy and writes them to disk before the freeblks is allowed to complete. */ static void complete_trunc_indir(freework) struct freework *freework; { struct freework *fwn; struct indirdep *indirdep; struct ufsmount *ump; struct buf *bp; uintptr_t start; int count; ump = VFSTOUFS(freework->fw_list.wk_mp); LOCK_OWNED(ump); indirdep = freework->fw_indir; for (;;) { bp = indirdep->ir_bp; /* See if the block was discarded. */ if (bp == NULL) break; /* Inline part of getdirtybuf(). We don't want bremfree. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) break; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, LOCK_PTR(ump)) == 0) BUF_UNLOCK(bp); ACQUIRE_LOCK(ump); } freework->fw_state |= DEPCOMPLETE; TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); /* * Zero the pointers in the saved copy. */ if (indirdep->ir_state & UFS1FMT) start = sizeof(ufs1_daddr_t); else start = sizeof(ufs2_daddr_t); start *= freework->fw_start; count = indirdep->ir_savebp->b_bcount - start; start += (uintptr_t)indirdep->ir_savebp->b_data; bzero((char *)start, count); /* * We need to start the next truncation in the list if it has not * been started yet. */ fwn = TAILQ_FIRST(&indirdep->ir_trunc); if (fwn != NULL) { if (fwn->fw_freeblks == indirdep->ir_freeblks) TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); if ((fwn->fw_state & ONWORKLIST) == 0) freework_enqueue(fwn); } /* * If bp is NULL the block was fully truncated, restore * the saved block list otherwise free it if it is no * longer needed. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) { if (bp == NULL) bcopy(indirdep->ir_saveddata, indirdep->ir_savebp->b_data, indirdep->ir_savebp->b_bcount); free(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = NULL; } /* * When bp is NULL there is a full truncation pending. We * must wait for this full truncation to be journaled before * we can release this freework because the disk pointers will * never be written as zero. */ if (bp == NULL) { if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) handle_written_freework(freework); else WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, &freework->fw_list); if (fwn == NULL) { freework->fw_indir = (void *)0x0000deadbeef0000; bp = indirdep->ir_savebp; indirdep->ir_savebp = NULL; free_indirdep(indirdep); FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); } } else { /* Complete when the real copy is written. */ WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); BUF_UNLOCK(bp); } } /* * Calculate the number of blocks we are going to release where datablocks * is the current total and length is the new file size.
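 *
 * The estimate below counts direct frags exactly and then charges each
 * level of indirection howmany(numblks, NINDIR) metadata blocks. The
 * same loop, standalone and in whole blocks rather than frags, for a
 * hypothetical geometry of 12 direct pointers and 4 pointers per
 * indirect block (names invented):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>

#define howmany(x, y)	(((x) + ((y) - 1)) / (y))
#define XNDADDR		12	/* direct pointers in the inode */
#define XNINDIR		4	/* pointers per indirect block */

/* Data plus indirect blocks needed to hold nblks data blocks. */
static long
blocks_for(long nblks)
{
	long tot = nblks;

	if (nblks <= XNDADDR)
		return (tot);
	nblks -= XNDADDR;
	for (;;) {
		tot += howmany(nblks, XNINDIR);	/* blocks at this level */
		nblks -= XNINDIR;	/* the inode points to one directly */
		if (nblks <= 0)
			break;
		nblks = howmany(nblks, XNINDIR);
	}
	return (tot);
}

int
main(void)
{
	/*
	 * 20 data blocks need two single indirects plus one double
	 * indirect (23 total); 10 fit in the direct pointers alone,
	 * so truncating from 20 to 10 releases 23 - 10 = 13 blocks.
	 */
	assert(blocks_for(10) == 10);
	assert(blocks_for(20) == 23);
	return (0);
}
#endif
/*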
*/ static ufs2_daddr_t blkcount(fs, datablocks, length) struct fs *fs; ufs2_daddr_t datablocks; off_t length; { off_t totblks, numblks; totblks = 0; numblks = howmany(length, fs->fs_bsize); if (numblks <= UFS_NDADDR) { totblks = howmany(length, fs->fs_fsize); goto out; } totblks = blkstofrags(fs, numblks); numblks -= UFS_NDADDR; /* * Count all single, then double, then triple indirects required. * Subtracting one indirect's worth of blocks for each pass * acknowledges one of each pointed to by the inode. */ for (;;) { totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); numblks -= NINDIR(fs); if (numblks <= 0) break; numblks = howmany(numblks, NINDIR(fs)); } out: totblks = fsbtodb(fs, totblks); /* * Handle sparse files. We can't reclaim more blocks than the inode * references. We will correct it later in handle_complete_freeblks() * when we know the real count. */ if (totblks > datablocks) return (0); return (datablocks - totblks); } /* * Handle freeblocks for journaled softupdate filesystems. * * Contrary to normal softupdates, we must preserve the block pointers in * indirects until their subordinates are free. This is to avoid journaling * every block that is freed which may consume more space than the journal * itself. The recovery program will see the free block journals at the * base of the truncated area and traverse them to reclaim space. The * pointers in the inode may be cleared immediately after the journal * records are written because each direct and indirect pointer in the * inode is recorded in a journal. This permits full truncation to proceed * asynchronously. The write order is journal -> inode -> cgs -> indirects. * * The algorithm is as follows: * 1) Traverse the in-memory state and create journal entries to release * the relevant blocks and full indirect trees. * 2) Traverse the indirect block chain adding partial truncation freework * records to indirects in the path to lastlbn. The freework will * prevent new allocation dependencies from being satisfied in this * indirect until the truncation completes. * 3) Read and lock the inode block, performing an update with the new size * and pointers. This prevents truncated data from becoming valid on * disk through step 4. * 4) Reap unsatisfied dependencies that are beyond the truncated area, * eliminate journal work for those records that do not require it. * 5) Schedule the journal records to be written followed by the inode block. * 6) Allocate any necessary frags for the end of file. * 7) Zero any partially truncated blocks. * * From this point truncation proceeds asynchronously using the freework and * indir_trunc machinery. The file will not be extended again into a * partially truncated indirect block until all work is completed but * the normal dependency mechanism ensures that it is rolled back/forward * as appropriate. Further truncation may occur without delay and is * serialized in indir_trunc().
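 *
 * Step 2's lastlbn and the frag count kept within it come from simple
 * block geometry: lastlbn = lblkno(length + bsize - 1) - 1, which is -1
 * when truncating to zero, and the bytes kept in that last block round
 * up to a fragment boundary. Worked standalone for a hypothetical
 * 32K/4K layout (macros simplified from their fs.h counterparts):
 */
#if 0	/* illustrative userland sketch; not compiled with this file */
#include <assert.h>

#define BSIZE	32768
#define FSIZE	4096

#define lblkno(len)		((len) / BSIZE)
#define blkoff(len)		((len) % BSIZE)
#define fragroundup(off)	(((off) + FSIZE - 1) / FSIZE * FSIZE)

int
main(void)
{
	long length, lastlbn, lastoff;

	/* Truncate to 40K: keep block 0 whole and 8K of block 1. */
	length = 40960;
	lastlbn = lblkno(length + BSIZE - 1) - 1;
	lastoff = blkoff(length);
	assert(lastlbn == 1);
	assert(fragroundup(lastoff) == 8192);	/* two frags kept */

	/* Truncate to zero: lastlbn is -1, nothing is kept. */
	assert(lblkno(0 + BSIZE - 1) - 1 == -1);
	return (0);
}
#endif
/*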
*/ void softdep_journal_freeblocks(ip, cred, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ struct ucred *cred; off_t length; /* The new length for the file */ int flags; /* IO_EXT and/or IO_NORMAL */ { struct freeblks *freeblks, *fbn; struct worklist *wk, *wkn; struct inodedep *inodedep; struct jblkdep *jblkdep; struct allocdirect *adp, *adpn; struct ufsmount *ump; struct fs *fs; struct buf *bp; struct vnode *vp; struct mount *mp; daddr_t dbn; ufs2_daddr_t extblocks, datablocks; ufs_lbn_t tmpval, lbn, lastlbn; int frags, lastoff, iboff, allocblock, needj, error, i; ump = ITOUMP(ip); mp = UFSTOVFS(ump); fs = ump->um_fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_journal_freeblocks called on non-softdep filesystem")); vp = ITOV(ip); needj = 1; iboff = -1; allocblock = 0; extblocks = 0; datablocks = 0; frags = 0; freeblks = newfreeblks(mp, ip); ACQUIRE_LOCK(ump); /* * If we're truncating a removed file that will never be written * we don't need to journal the block frees. The canceled journals * for the allocations will suffice. */ inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED && length == 0) needj = 0; CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d", ip->i_number, length, needj); FREE_LOCK(ump); /* * Calculate the lbn that we are truncating to. This results in -1 * if we're truncating to 0 bytes. So it is the last lbn we want * to keep, not the first lbn we want to truncate. */ lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastoff = blkoff(fs, length); /* * Compute frags we are keeping in lastlbn. 0 means all. */ if (lastlbn >= 0 && lastlbn < UFS_NDADDR) { frags = fragroundup(fs, lastoff); /* adp offset of last valid allocdirect. */ iboff = lastlbn; } else if (lastlbn > 0) iboff = UFS_NDADDR; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); /* * Handle normal data blocks and indirects. This section saves * values used after the inode update to complete frag and indirect * truncation. */ if ((flags & IO_NORMAL) != 0) { /* * Handle truncation of whole direct and indirect blocks. */ for (i = iboff + 1; i < UFS_NDADDR; i++) setup_freedirect(freeblks, ip, i, needj); for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++, lbn += tmpval, tmpval *= NINDIR(fs)) { /* Release a whole indirect tree. */ if (lbn > lastlbn) { setup_freeindir(freeblks, ip, i, -lbn -i, needj); continue; } iboff = i + UFS_NDADDR; /* * Traverse partially truncated indirect tree. */ if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) setup_trunc_indir(freeblks, ip, -lbn - i, lastlbn, DIP(ip, i_ib[i])); } /* * Handle partial truncation to a frag boundary. */ if (frags) { ufs2_daddr_t blkno; long oldfrags; oldfrags = blksize(fs, ip, lastlbn); blkno = DIP(ip, i_db[lastlbn]); if (blkno && oldfrags != frags) { oldfrags -= frags; oldfrags = numfrags(fs, oldfrags); blkno += numfrags(fs, frags); newfreework(ump, freeblks, NULL, lastlbn, blkno, oldfrags, 0, needj); if (needj) adjust_newfreework(freeblks, numfrags(fs, frags)); } else if (blkno == 0) allocblock = 1; } /* * Add a journal record for partial truncate if we are * handling indirect blocks. Non-indirects need no extra * journaling.
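 *
 * For reference, a hedged userland sketch of the lastlbn/lastoff/frags
 * arithmetic computed near the top of this function (the 32K block and
 * 4K fragment sizes are assumptions, and the plain divisions stand in
 * for the lblkno()/blkoff()/fragroundup() macros on power-of-2 sizes):
 *
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		long long bsize = 32768, fsize = 4096, length = 100000;
 *		long long lastlbn, lastoff, frags;
 *
 *		// Last lbn we keep; -1 when truncating to 0 bytes.
 *		lastlbn = (length + bsize - 1) / bsize - 1;
 *		// Byte offset of the new EOF within that block.
 *		lastoff = length % bsize;
 *		// Frag-rounded bytes kept in lastlbn; 0 means all.
 *		frags = (lastoff + fsize - 1) / fsize * fsize;
 *		printf("lastlbn %lld lastoff %lld frags %lld\n",
 *		    lastlbn, lastoff, frags);	// 3 1696 4096
 *		return (0);
 *	}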
*/ if (length != 0 && lastlbn >= UFS_NDADDR) { UFS_INODE_SET_FLAG(ip, IN_TRUNCATED); newjtrunc(freeblks, length, 0); } ip->i_size = length; DIP_SET(ip, i_size, ip->i_size); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); datablocks = DIP(ip, i_blocks) - extblocks; if (length != 0) datablocks = blkcount(fs, datablocks, length); freeblks->fb_len = length; } if ((flags & IO_EXT) != 0) { for (i = 0; i < UFS_NXADDR; i++) setup_freeext(freeblks, ip, i, needj); ip->i_din2->di_extsize = 0; datablocks += extblocks; UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); } #ifdef QUOTA /* Reference the quotas in case the block count is wrong in the end. */ quotaref(vp, freeblks->fb_quota); (void) chkdq(ip, -datablocks, NOCRED, FORCE); #endif freeblks->fb_chkcnt = -datablocks; UFS_LOCK(ump); fs->fs_pendingblocks += datablocks; UFS_UNLOCK(ump); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); /* * Handle truncation of incomplete alloc direct dependencies. We * hold the inode block locked to prevent incomplete dependencies * from reaching the disk while we are eliminating those that * have been truncated. This is a partially inlined ffs_update(). */ ufs_itimes(vp); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); dbn = fsbtodb(fs, ino_to_fsba(fs, ip->i_number)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, cred, 0, NULL, &bp); if (error) { softdep_error("softdep_journal_freeblocks", error); return; } if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; softdep_update_inodeblock(ip, bp, 0); if (ump->um_fstype == UFS1) { *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; } else { ffs_update_dinode_ckhash(fs, ip->i_din2); *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; } ACQUIRE_LOCK(ump); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_journal_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (needj), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. */ if (needj) WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); else freeblks->fb_state |= COMPLETE; if ((flags & IO_NORMAL) != 0) { TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { if (adp->ad_offset > iboff) cancel_allocdirect(&inodedep->id_inoupdt, adp, freeblks); /* * Truncate the allocdirect. We could eliminate * or modify journal records as well. */ else if (adp->ad_offset == iboff && frags) adp->ad_newsize = frags; } } if ((flags & IO_EXT) != 0) while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) cancel_allocdirect(&inodedep->id_extupdt, adp, freeblks); /* * Scan the bufwait list for newblock dependencies that will never * make it to disk. */ LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) { if (wk->wk_type != D_ALLOCDIRECT) continue; adp = WK_ALLOCDIRECT(wk); if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) || ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) { cancel_jfreeblk(freeblks, adp->ad_newblkno); cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); } } /* * Add journal work. */ LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) add_to_journal(&jblkdep->jb_list); FREE_LOCK(ump); bdwrite(bp); /* * Truncate dependency structures beyond length.
*/ trunc_dependencies(ip, freeblks, lastlbn, frags, flags); /* * This is only set when we need to allocate a fragment because * none existed at the end of a frag-sized file. It handles only * allocating a new, zero-filled block. */ if (allocblock) { ip->i_size = length - lastoff; DIP_SET(ip, i_size, ip->i_size); error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); if (error != 0) { softdep_error("softdep_journal_freeblocks", error); return; } ip->i_size = length; DIP_SET(ip, i_size, length); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); allocbuf(bp, frags); ffs_update(vp, 0); bawrite(bp); } else if (lastoff != 0 && vp->v_type != VDIR) { int size; /* * Zero the end of a truncated frag or block. */ size = sblksize(fs, length, lastlbn); error = bread(vp, lastlbn, size, cred, &bp); if (error == 0) { bzero((char *)bp->b_data + lastoff, size - lastoff); bawrite(bp); } else if (!ffs_fsfail_cleanup(ump, error)) { softdep_error("softdep_journal_freeblocks", error); return; } } ACQUIRE_LOCK(ump); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; /* * We zero earlier truncations so they don't erroneously * update i_blocks. */ if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) fbn->fb_len = 0; if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) freeblks->fb_state |= INPROGRESS; else freeblks = NULL; FREE_LOCK(ump); if (freeblks) handle_workitem_freeblocks(freeblks, 0); trunc_pages(ip, length, extblocks, flags); } /* * Flush a JOP_SYNC to the journal. */ void softdep_journal_fsync(ip) struct inode *ip; { struct jfsync *jfsync; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_journal_fsync called on non-softdep filesystem")); if ((ip->i_flag & IN_TRUNCATED) == 0) return; ip->i_flag &= ~IN_TRUNCATED; jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump)); jfsync->jfs_size = ip->i_size; jfsync->jfs_ino = ip->i_number; ACQUIRE_LOCK(ump); add_to_journal(&jfsync->jfs_list); jwait(&jfsync->jfs_list, MNT_WAIT); FREE_LOCK(ump); } /* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before * the blocks are made available for use by other files. (The true * requirement is that old pointers must be nullified before new on-disk * pointers are set. We chose this slightly more stringent requirement to * reduce complexity.) Our implementation handles this dependency by updating * the inode (or indirect block) appropriately but delaying the actual block * de-allocation (i.e., freemap and free space count manipulation) until * after the updated versions reach stable storage. After the disk is * updated, the blocks can be safely de-allocated whenever it is convenient. * This implementation handles only the common case of reducing a file's * length to zero. Other cases are handled by the conventional synchronous * write approach. * * The ffs implementation with which we worked double-checks * the state of the block pointers and file size as it reduces * a file's length. Some of this code is replicated here in our * soft updates implementation. The freeblks->fb_chkcnt field is * used to transfer a part of this information to the procedure * that eventually de-allocates the blocks.
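 *
 * A toy model of the nullify-before-free rule above (illustrative
 * userland code only, not the kernel mechanism; the "disk" here is a
 * pair of plain arrays):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t disk_ptr[12];	// on-"disk" block pointers
 *	static uint8_t	blk_free[64];	// 1 => block may be reused
 *
 *	static void
 *	free_data_block(int slot)
 *	{
 *		uint64_t blkno = disk_ptr[slot];
 *
 *		// First nullify the stable pointer (softdep waits for
 *		// this write to reach the disk)...
 *		disk_ptr[slot] = 0;
 *		// ...and only then make the block available for reuse;
 *		// reversing the order could let a crash leave one block
 *		// claimed by two files.
 *		blk_free[blkno] = 1;
 *	}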
* * This routine should be called from the routine that shortens * a file's length, before the inode's size or block pointers * are modified. It will save the block pointer information for * later release and zero the inode so that the calling routine * can release it. */ void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ off_t length; /* The new length for the file */ int flags; /* IO_EXT and/or IO_NORMAL */ { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; struct ufsmount *ump; struct buf *bp; struct fs *fs; ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error; ufs_lbn_t tmpval; ufs_lbn_t lbn; ump = ITOUMP(ip); mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_freeblocks called on non-softdep filesystem")); CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld", ip->i_number, length); KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length")); fs = ump->um_fs; if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp)) != 0) { if (!ffs_fsfail_cleanup(ump, error)) softdep_error("softdep_setup_freeblocks", error); return; } freeblks = newfreeblks(mp, ip); extblocks = 0; datablocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); if ((flags & IO_NORMAL) != 0) { for (i = 0; i < UFS_NDADDR; i++) setup_freedirect(freeblks, ip, i, 0); for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++, lbn += tmpval, tmpval *= NINDIR(fs)) setup_freeindir(freeblks, ip, i, -lbn -i, 0); ip->i_size = 0; DIP_SET(ip, i_size, 0); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); datablocks = DIP(ip, i_blocks) - extblocks; } if ((flags & IO_EXT) != 0) { for (i = 0; i < UFS_NXADDR; i++) setup_freeext(freeblks, ip, i, 0); ip->i_din2->di_extsize = 0; datablocks += extblocks; UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); } #ifdef QUOTA /* Reference the quotas in case the block count is wrong in the end. */ quotaref(ITOV(ip), freeblks->fb_quota); (void) chkdq(ip, -datablocks, NOCRED, FORCE); #endif freeblks->fb_chkcnt = -datablocks; UFS_LOCK(ump); fs->fs_pendingblocks += datablocks; UFS_UNLOCK(ump); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); /* * Push the zero'ed inode to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone * the buffer can be safely released. */ if (ump->um_fstype == UFS1) { dp1 = ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); ip->i_din1->di_freelink = dp1->di_freelink; *dp1 = *ip->i_din1; } else { dp2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); ip->i_din2->di_freelink = dp2->di_freelink; ffs_update_dinode_ckhash(fs, ip->i_din2); *dp2 = *ip->i_din2; } /* * Find and eliminate any inode dependencies. */ ACQUIRE_LOCK(ump); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (delay == 0), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. 
*/ delay = (inodedep->id_state & DEPCOMPLETE); if (delay) WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); else freeblks->fb_state |= COMPLETE; /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. * If we still have a bitmap dependency, then the inode has never * been written to disk, so we can free any fragments without delay. */ if (flags & IO_NORMAL) { merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) cancel_allocdirect(&inodedep->id_inoupdt, adp, freeblks); } if (flags & IO_EXT) { merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) cancel_allocdirect(&inodedep->id_extupdt, adp, freeblks); } FREE_LOCK(ump); bdwrite(bp); trunc_dependencies(ip, freeblks, -1, 0, flags); ACQUIRE_LOCK(ump); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk * we can start freeing blocks. */ if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) freeblks->fb_state |= INPROGRESS; else freeblks = NULL; FREE_LOCK(ump); if (freeblks) handle_workitem_freeblocks(freeblks, 0); trunc_pages(ip, length, extblocks, flags); } /* * Eliminate pages from the page cache that back parts of this inode and * adjust the vnode pager's idea of our size. This prevents stale data * from hanging around in the page cache. */ static void trunc_pages(ip, length, extblocks, flags) struct inode *ip; off_t length; ufs2_daddr_t extblocks; int flags; { struct vnode *vp; struct fs *fs; ufs_lbn_t lbn; off_t end, extend; vp = ITOV(ip); fs = ITOFS(ip); extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); if ((flags & IO_EXT) != 0) vn_pages_remove(vp, extend, 0); if ((flags & IO_NORMAL) == 0) return; BO_LOCK(&vp->v_bufobj); drain_output(vp); BO_UNLOCK(&vp->v_bufobj); /* * The vnode pager eliminates file pages; we eliminate indirects * below. */ vnode_pager_setsize(vp, length); /* * Calculate the end based on the last indirect we want to keep. If * the block extends into indirects we can just use the negative of * its lbn. Doubles and triples exist at lower numbers so we must * be careful not to remove those, if they exist. Double and triple * indirect lbns do not overlap with others so it is not important * to verify how many levels are required. */ lbn = lblkno(fs, length); if (lbn >= UFS_NDADDR) { /* Calculate the virtual lbn of the triple indirect. */ lbn = -lbn - (UFS_NIADDR - 1); end = OFF_TO_IDX(lblktosize(fs, lbn)); } else end = extend; vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); } /* * See if the buf bp is in the range eliminated by truncation. */ static int trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) struct buf *bp; int *blkoffp; ufs_lbn_t lastlbn; int lastoff; int flags; { ufs_lbn_t lbn; *blkoffp = 0; /* Only match ext/normal blocks as appropriate. */ if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) return (0); /* ALTDATA is always a full truncation. */ if ((bp->b_xflags & BX_ALTDATA) != 0) return (1); /* -1 is full truncation.
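 *
 * The partial-truncation match below relies on the convention that a
 * level-i indirect root covering data lbns starting at base is cached
 * at the negative lbn -base - i (the value handed to setup_freeindir()
 * above). An illustrative userland sketch of that encoding and its
 * inverse (the 4096 fan-out and 12/3 direct/indirect counts are
 * assumptions; the kernel derives the level with lbn_level()):
 *
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		long long nindir = 4096, base, span, meta;
 *		int i;
 *
 *		for (i = 0, span = nindir, base = 12; i < 3;
 *		    i++, base += span, span *= nindir) {
 *			meta = -base - i;	// cached lbn of the root
 *			// Knowing the level inverts the encoding, as
 *			// trunc_check_buf() does: -(meta + i) == base.
 *			printf("level %d: meta lbn %lld covers data lbns"
 *			    " from %lld\n", i, meta, -(meta + i));
 *		}
 *		return (0);
 *	}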
*/ if (lastlbn == -1) return (1); /* * If this is a partial truncate we only want those * blocks and indirect blocks that cover the range * we're after. */ lbn = bp->b_lblkno; if (lbn < 0) lbn = -(lbn + lbn_level(lbn)); if (lbn < lastlbn) return (0); /* Here we only truncate lblkno if it's partial. */ if (lbn == lastlbn) { if (lastoff == 0) return (0); *blkoffp = lastoff; } return (1); } /* * Eliminate any dependencies that exist in memory beyond lblkno:off */ static void trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) struct inode *ip; struct freeblks *freeblks; ufs_lbn_t lastlbn; int lastoff; int flags; { struct bufobj *bo; struct vnode *vp; struct buf *bp; int blkoff; /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, walk the list and get rid of * any dependencies. */ vp = ITOV(ip); bo = &vp->v_bufobj; BO_LOCK(bo); drain_output(vp); TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; restart: TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { if (bp->b_vflags & BV_SCANNED) continue; if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { bp->b_vflags |= BV_SCANNED; continue; } KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer")); if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL) goto restart; BO_UNLOCK(bo); if (deallocate_dependencies(bp, freeblks, blkoff)) bqrelse(bp); else brelse(bp); BO_LOCK(bo); goto restart; } /* * Now do the work of vtruncbuf while also matching indirect blocks. */ TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; cleanrestart: TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { if (bp->b_vflags & BV_SCANNED) continue; if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { bp->b_vflags |= BV_SCANNED; continue; } if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); goto cleanrestart; } bp->b_vflags |= BV_SCANNED; bremfree(bp); if (blkoff != 0) { allocbuf(bp, blkoff); bqrelse(bp); } else { bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; brelse(bp); } BO_LOCK(bo); goto cleanrestart; } drain_output(vp); BO_UNLOCK(bo); } static int cancel_pagedep(pagedep, freeblks, blkoff) struct pagedep *pagedep; struct freeblks *freeblks; int blkoff; { struct jremref *jremref; struct jmvref *jmvref; struct dirrem *dirrem, *tmp; int i; /* * Copy any directory remove dependencies to the list * to be processed after the freeblks proceeds. If the * directory entries never made it to disk they * can be dumped directly onto the work list. */ LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { /* Skip this directory removal if it is intended to remain. */ if (dirrem->dm_offset < blkoff) continue; /* * If there are any dirrems we wait for the journal write * to complete and then restart the buf scan as the lock * has been dropped. */ while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { jwait(&jremref->jr_list, MNT_WAIT); return (ERESTART); } LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); } while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { jwait(&jmvref->jm_list, MNT_WAIT); return (ERESTART); } /* * When we're partially truncating a pagedep we just want to flush * journal entries and return. There cannot be any adds in the * truncated portion of the directory and newblk must remain if * part of the block remains.
*/ if (blkoff != 0) { struct diradd *dap; LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset > blkoff) panic("cancel_pagedep: diradd %p off %d > %d", dap, dap->da_offset, blkoff); for (i = 0; i < DAHASHSZ; i++) LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) if (dap->da_offset > blkoff) panic("cancel_pagedep: diradd %p off %d > %d", dap, dap->da_offset, blkoff); return (0); } /* * There should be no directory add dependencies present * as the directory could not be truncated until all * children were removed. */ KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, ("deallocate_dependencies: pendinghd != NULL")); for (i = 0; i < DAHASHSZ; i++) KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, ("deallocate_dependencies: diraddhd != NULL")); if ((pagedep->pd_state & NEWBLOCK) != 0) free_newdirblk(pagedep->pd_newdirblk); if (free_pagedep(pagedep) == 0) panic("Failed to free pagedep %p", pagedep); return (0); } /* * Reclaim any dependency structures from a buffer that is about to * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. */ static int deallocate_dependencies(bp, freeblks, off) struct buf *bp; struct freeblks *freeblks; int off; { struct indirdep *indirdep; struct pagedep *pagedep; struct worklist *wk, *wkn; struct ufsmount *ump; ump = softdep_bp_to_mp(bp); if (ump == NULL) goto done; ACQUIRE_LOCK(ump); LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { switch (wk->wk_type) { case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); cancel_indirdep(indirdep, bp, freeblks); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); if (cancel_pagedep(pagedep, freeblks, off)) { FREE_LOCK(ump); return (ERESTART); } continue; case D_ALLOCINDIR: /* * Simply remove the allocindir, we'll find it via * the indirdep where we can clear pointers if * needed. */ WORKLIST_REMOVE(wk); continue; case D_FREEWORK: /* * A truncation is waiting for the zero'd pointers * to be written. It can be freed when the freeblks * is journaled. */ WORKLIST_REMOVE(wk); wk->wk_state |= ONDEPLIST; WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); break; case D_ALLOCDIRECT: if (off != 0) continue; /* FALLTHROUGH */ default: panic("deallocate_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(ump); done: /* * Don't throw away this buf, we were partially truncating and * some deps may always remain. */ if (off) { allocbuf(bp, off); bp->b_vflags |= BV_SCANNED; return (EBUSY); } bp->b_flags |= B_INVAL | B_NOCACHE; return (0); } /* * An allocdirect is being canceled due to a truncate. We must make sure * the journal entry is released in concert with the blkfree that releases * the storage. Completed journal entries must not be released until the * space is no longer pointed to by the inode or in the bitmap. */ static void cancel_allocdirect(adphead, adp, freeblks) struct allocdirectlst *adphead; struct allocdirect *adp; struct freeblks *freeblks; { struct freework *freework; struct newblk *newblk; struct worklist *wk; TAILQ_REMOVE(adphead, adp, ad_next); newblk = (struct newblk *)adp; freework = NULL; /* * Find the correct freework structure. 
*/ LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { if (wk->wk_type != D_FREEWORK) continue; freework = WK_FREEWORK(wk); if (freework->fw_blkno == newblk->nb_newblkno) break; } if (freework == NULL) panic("cancel_allocdirect: Freework not found"); /* * If a newblk exists at all we still have the journal entry that * initiated the allocation so we do not need to journal the free. */ cancel_jfreeblk(freeblks, freework->fw_blkno); /* * If the journal hasn't been written the jnewblk must be passed * to the call to ffs_blkfree that reclaims the space. We accomplish * this by linking the journal dependency into the freework to be * freed when freework_freeblock() is called. If the journal has * been written we can simply reclaim the journal space when the * freeblks work is complete. */ freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); } /* * Cancel a new block allocation. May be an indirect or direct block. We * remove it from various lists and return any journal record that needs to * be resolved by the caller. * * A special consideration is made for indirects which were never pointed * at on disk and will never be found once this block is released. */ static struct jnewblk * cancel_newblk(newblk, wk, wkhd) struct newblk *newblk; struct worklist *wk; struct workhead *wkhd; { struct jnewblk *jnewblk; CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno); newblk->nb_state |= GOINGAWAY; /* * Previously we traversed the completedhd on each indirdep * attached to this newblk to cancel them and gather journal * work. Since we need only the oldest journal segment and * the lowest point on the tree will always have the oldest * journal segment we are free to release the segments * of any subordinates and may leave the indirdep list to * indirdep_complete() when this newblk is freed. */ if (newblk->nb_state & ONDEPLIST) { newblk->nb_state &= ~ONDEPLIST; LIST_REMOVE(newblk, nb_deps); } if (newblk->nb_state & ONWORKLIST) WORKLIST_REMOVE(&newblk->nb_list); /* * If the journal entry hasn't been written we save a pointer to * the dependency that frees it until it is written or the * superseding operation completes. */ jnewblk = newblk->nb_jnewblk; if (jnewblk != NULL && wk != NULL) { newblk->nb_jnewblk = NULL; jnewblk->jn_dep = wk; } if (!LIST_EMPTY(&newblk->nb_jwork)) jwork_move(wkhd, &newblk->nb_jwork); /* * When truncating we must free the newdirblk early to remove * the pagedep from the hash before returning. */ if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) free_newdirblk(WK_NEWDIRBLK(wk)); if (!LIST_EMPTY(&newblk->nb_newdirblk)) panic("cancel_newblk: extra newdirblk"); return (jnewblk); } /* * Schedule the freefrag associated with a newblk to be released once * the pointers are written and the previous block is no longer needed. */ static void newblk_freefrag(newblk) struct newblk *newblk; { struct freefrag *freefrag; if (newblk->nb_freefrag == NULL) return; freefrag = newblk->nb_freefrag; newblk->nb_freefrag = NULL; freefrag->ff_state |= COMPLETE; if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freefrag->ff_list, 0); } /* * Free a newblk. Generate a new freefrag work request if appropriate. * This must be called after the inode pointer and any direct block pointers * are valid or fully removed via truncate or frag extension. 
*/ static void free_newblk(newblk) struct newblk *newblk; { struct indirdep *indirdep; struct worklist *wk; KASSERT(newblk->nb_jnewblk == NULL, ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk)); KASSERT(newblk->nb_list.wk_type != D_NEWBLK, ("free_newblk: unclaimed newblk")); LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp)); newblk_freefrag(newblk); if (newblk->nb_state & ONDEPLIST) LIST_REMOVE(newblk, nb_deps); if (newblk->nb_state & ONWORKLIST) WORKLIST_REMOVE(&newblk->nb_list); LIST_REMOVE(newblk, nb_hash); if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) free_newdirblk(WK_NEWDIRBLK(wk)); if (!LIST_EMPTY(&newblk->nb_newdirblk)) panic("free_newblk: extra newdirblk"); while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) indirdep_complete(indirdep); handle_jwork(&newblk->nb_jwork); WORKITEM_FREE(newblk, D_NEWBLK); } /* * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. */ static void free_newdirblk(newdirblk) struct newdirblk *newdirblk; { struct pagedep *pagedep; struct diradd *dap; struct worklist *wk; LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp)); WORKLIST_REMOVE(&newdirblk->db_list); /* * If the pagedep is still linked onto the directory buffer * dependency chain, then some of the entries on the * pd_pendinghd list may not be committed to disk yet. In * this case, we will simply clear the NEWBLOCK flag and * let the pd_pendinghd list be processed when the pagedep * is next written. If the pagedep is no longer on the buffer * dependency chain, then all the entries on the pd_pending * list are committed to disk and we can free them here. */ pagedep = newdirblk->db_pagedep; pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) { while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap, NULL); /* * If no dependencies remain, the pagedep will be freed. */ free_pagedep(pagedep); } /* Should only ever be one item in the list. */ while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { WORKLIST_REMOVE(wk); handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } /* * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct inode *ip = VTOI(pvp); struct inodedep *inodedep; struct freefile *freefile; struct freeblks *freeblks; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_freefile called on non-softdep filesystem")); /* * This sets up the inode de-allocation dependency. */ freefile = malloc(sizeof(struct freefile), M_FREEFILE, M_SOFTDEP_FLAGS); workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ump->um_devvp; LIST_INIT(&freefile->fx_jwork); UFS_LOCK(ump); ump->um_fs->fs_pendinginodes += 1; UFS_UNLOCK(ump); /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either * case we can free the file immediately. If the journal was * canceled before being written the inode will never make it to * disk and we must send the canceled journal entries to * ffs_freefile() to be cleared in conjunction with the bitmap. * Any blocks waiting on the inode to write can be safely freed * here as it will never be written.
*/ ACQUIRE_LOCK(ump); inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); if (inodedep) { /* * Clear out freeblks that no longer need to reference * this inode. */ while ((freeblks = TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state &= ~ONDEPLIST; } /* * Remove this inode from the unlinked list. */ if (inodedep->id_state & UNLINKED) { /* * Save the journal work to be freed with the bitmap * before we clear UNLINKED. Otherwise it can be lost * if the inode block is written. */ handle_bufwait(inodedep, &freefile->fx_jwork); clear_unlinked_inodedep(inodedep); /* * Re-acquire inodedep as we've dropped the * per-filesystem lock in clear_unlinked_inodedep(). */ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); } } if (inodedep == NULL || check_inode_unwritten(inodedep)) { FREE_LOCK(ump); handle_workitem_freefile(freefile); return; } if ((inodedep->id_state & DEPCOMPLETE) == 0) inodedep->id_state |= GOINGAWAY; WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); FREE_LOCK(ump); if (ip->i_number == ino) UFS_INODE_SET_FLAG(ip, IN_MODIFIED); } /* * Check to see if an inode has never been written to disk. If * so free the inodedep and return success, otherwise return failure. * * If we still have a bitmap dependency, then the inode has never * been written to disk. Drop the dependency as it is no longer * necessary since the inode is being deallocated. We set the * ALLCOMPLETE flags since the bitmap now properly shows that the * inode is not allocated. Even if the inode is actively being * written, it has been rolled back to its zero'ed state, so we * are ensured that a zero inode is what is on the disk. For short * lived files, this change will usually result in removing all the * dependencies from the inode so that it can be freed immediately. */ static int check_inode_unwritten(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_freeblklst) || inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0) return (0); /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". 
*/ if ((inodedep->id_state & IOSTARTED) != 0 && inodedep->id_savedino1 == NULL) return (0); if (inodedep->id_state & ONDEPLIST) LIST_REMOVE(inodedep, id_deps); inodedep->id_state &= ~ONDEPLIST; inodedep->id_state |= ALLCOMPLETE; inodedep->id_bmsafemap = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { free(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; } if (free_inodedep(inodedep) == 0) panic("check_inode_unwritten: busy inode"); return (1); } static int check_inodedep_free(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_freeblklst) || inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) return (0); return (1); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ static int free_inodedep(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || !check_inodedep_free(inodedep)) return (0); if (inodedep->id_state & ONDEPLIST) LIST_REMOVE(inodedep, id_deps); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } /* * Free the block referenced by a freework structure. The parent freeblks * structure is released and completed when the final cg bitmap reaches * the disk. This routine may be freeing a jnewblk which never made it to * disk in which case we do not have to wait as the operation is undone * in memory immediately. */ static void freework_freeblock(freework, key) struct freework *freework; u_long key; { struct freeblks *freeblks; struct jnewblk *jnewblk; struct ufsmount *ump; struct workhead wkhd; struct fs *fs; int bsize; int needj; ump = VFSTOUFS(freework->fw_list.wk_mp); LOCK_OWNED(ump); /* * Handle partial truncate separately. */ if (freework->fw_indir) { complete_trunc_indir(freework); return; } freeblks = freework->fw_freeblks; fs = ump->um_fs; needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; bsize = lfragtosize(fs, freework->fw_frags); LIST_INIT(&wkhd); /* * DEPCOMPLETE is cleared in indirblk_insert() if the block lives * on the indirblk hashtable and prevents premature freeing. */ freework->fw_state |= DEPCOMPLETE; /* * SUJ needs to wait for the segment referencing freed indirect * blocks to expire so that we know the checker will not confuse * a re-allocated indirect block with its old contents. */ if (needj && freework->fw_lbn <= -UFS_NDADDR) indirblk_insert(freework); /* * If we are canceling an existing jnewblk pass it to the free * routine, otherwise pass the freeblk which will ultimately * release the freeblks. If we're not journaling, we can just * free the freeblks immediately. 
*/ jnewblk = freework->fw_jnewblk; if (jnewblk != NULL) { cancel_jnewblk(jnewblk, &wkhd); needj = 0; } else if (needj) { freework->fw_state |= DELAYEDFREE; freeblks->fb_cgwait++; WORKLIST_INSERT(&wkhd, &freework->fw_list); } FREE_LOCK(ump); freeblks_free(ump, freeblks, btodb(bsize)); CTR4(KTR_SUJ, "freework_freeblock: ino %jd blkno %jd lbn %jd size %d", freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key); ACQUIRE_LOCK(ump); /* * The jnewblk will be discarded and the bits in the map never * made it to disk. We can immediately free the freeblk. */ if (needj == 0) handle_written_freework(freework); } /* * We enqueue freework items that need processing back on the freeblks and * add the freeblks to the worklist. This makes it easier to find all work * required to flush a truncation in process_truncates(). */ static void freework_enqueue(freework) struct freework *freework; { struct freeblks *freeblks; freeblks = freework->fw_freeblks; if ((freework->fw_state & INPROGRESS) == 0) WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); if ((freeblks->fb_state & (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(&freeblks->fb_list, WK_NODELAY); } /* * Start, continue, or finish the process of freeing an indirect block tree. * The free operation may be paused at any point with fw_off containing the * offset to restart from. This enables us to implement some flow control * for large truncates which may fan out and generate a huge number of * dependencies. */ static void handle_workitem_indirblk(freework) struct freework *freework; { struct freeblks *freeblks; struct ufsmount *ump; struct fs *fs; freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; if (freework->fw_state & DEPCOMPLETE) { handle_written_freework(freework); return; } if (freework->fw_off == NINDIR(fs)) { freework_freeblock(freework, SINGLETON_KEY); return; } freework->fw_state |= INPROGRESS; FREE_LOCK(ump); indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), freework->fw_lbn); ACQUIRE_LOCK(ump); } /* * Called when a freework structure attached to a cg buf is written. The * ref on either the parent or the freeblks structure is released and * the freeblks is added back to the worklist if there is more work to do. */ static void handle_written_freework(freework) struct freework *freework; { struct freeblks *freeblks; struct freework *parent; freeblks = freework->fw_freeblks; parent = freework->fw_parent; if (freework->fw_state & DELAYEDFREE) freeblks->fb_cgwait--; freework->fw_state |= COMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); if (parent) { if (--parent->fw_ref == 0) freework_enqueue(parent); return; } if (--freeblks->fb_ref != 0) return; if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(&freeblks->fb_list, WK_NODELAY); } /* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, * checks regarding the number of blocks de-allocated (compared * to the number of blocks allocated for the file) are also * performed in this function. 
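 *
 * The running reconciliation of that block count is done by
 * freeblks_free() below; here is a hedged userland model of its
 * bookkeeping (the struct and the scenario in main() are assumptions,
 * only the clamping logic mirrors the kernel code):
 *
 *	#include <stdio.h>
 *
 *	static long long fs_pendingblocks;
 *
 *	struct model_freeblks {
 *		long long fb_chkcnt;	// -(blocks still expected to free)
 *	};
 *
 *	static void
 *	model_freeblks_free(struct model_freeblks *fb, long long blocks)
 *	{
 *		long long remain = -fb->fb_chkcnt;
 *
 *		fb->fb_chkcnt += blocks;
 *		if (remain > 0) {
 *			// Never subtract more than is pending; the
 *			// estimate from blkcount() may have been short.
 *			if (remain < blocks)
 *				blocks = remain;
 *			fs_pendingblocks -= blocks;
 *		}
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		struct model_freeblks fb = { .fb_chkcnt = -100 };
 *
 *		fs_pendingblocks = 100;
 *		model_freeblks_free(&fb, 60);
 *		model_freeblks_free(&fb, 60);	// overshoot is clamped
 *		printf("pending %lld chkcnt %lld\n",
 *		    fs_pendingblocks, fb.fb_chkcnt);	// pending 0 chkcnt 20
 *		return (0);
 *	}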
*/ static int handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct freework *freework; struct newblk *newblk; struct allocindir *aip; struct ufsmount *ump; struct worklist *wk; u_long key; KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), ("handle_workitem_freeblocks: Journal entries not written.")); ump = VFSTOUFS(freeblks->fb_list.wk_mp); key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_DIRREM: wk->wk_state |= COMPLETE; add_to_worklist(wk, 0); continue; case D_ALLOCDIRECT: free_newblk(WK_NEWBLK(wk)); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); freework = NULL; if (aip->ai_state & DELAYEDFREE) { FREE_LOCK(ump); freework = newfreework(ump, freeblks, NULL, aip->ai_lbn, aip->ai_newblkno, ump->um_fs->fs_frag, 0, 0); ACQUIRE_LOCK(ump); } newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk) { freework->fw_jnewblk = newblk->nb_jnewblk; newblk->nb_jnewblk->jn_dep = &freework->fw_list; newblk->nb_jnewblk = NULL; } free_newblk(newblk); continue; case D_FREEWORK: freework = WK_FREEWORK(wk); if (freework->fw_lbn <= -UFS_NDADDR) handle_workitem_indirblk(freework); else freework_freeblock(freework, key); continue; default: panic("handle_workitem_freeblocks: Unknown type %s", TYPENAME(wk->wk_type)); } } if (freeblks->fb_ref != 0) { freeblks->fb_state &= ~INPROGRESS; wake_worklist(&freeblks->fb_list); freeblks = NULL; } FREE_LOCK(ump); ffs_blkrelease_finish(ump, key); if (freeblks) return handle_complete_freeblocks(freeblks, flags); return (0); } /* * Handle completion of block free via truncate. This allows fs_pending * to track the actual free block count more closely than if we only updated * it at the end. We must be careful to handle cases where the block count * on free was incorrect. */ static void freeblks_free(ump, freeblks, blocks) struct ufsmount *ump; struct freeblks *freeblks; int blocks; { struct fs *fs; ufs2_daddr_t remain; UFS_LOCK(ump); remain = -freeblks->fb_chkcnt; freeblks->fb_chkcnt += blocks; if (remain > 0) { if (remain < blocks) blocks = remain; fs = ump->um_fs; fs->fs_pendingblocks -= blocks; } UFS_UNLOCK(ump); } /* * Once all of the freework workitems are complete we can retire the * freeblocks dependency and any journal work awaiting completion. This * can not be called until all other dependencies are stable on disk. */ static int handle_complete_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct inodedep *inodedep; struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; ufs2_daddr_t spare; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; flags = LK_EXCLUSIVE | flags; spare = freeblks->fb_chkcnt; /* * If we did not release the expected number of blocks we may have * to adjust the inode block count here. Only do so if it wasn't * a truncation to zero and the modrev still matches. */ if (spare && freeblks->fb_len != 0) { if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, flags, &vp, FFSV_FORCEINSMQ) != 0) return (EBUSY); ip = VTOI(vp); if (ip->i_mode == 0) { vgone(vp); } else if (DIP(ip, i_modrev) == freeblks->fb_modrev) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); UFS_INODE_SET_FLAG(ip, IN_CHANGE); /* * We must wait so this happens before the * journal is reclaimed. */ ffs_update(vp, 1); } vput(vp); } if (spare < 0) { UFS_LOCK(ump); fs->fs_pendingblocks += spare; UFS_UNLOCK(ump); } #ifdef QUOTA /* Handle spare. 
*/ if (spare) quotaadj(freeblks->fb_quota, ump, -spare); quotarele(freeblks->fb_quota); #endif ACQUIRE_LOCK(ump); if (freeblks->fb_state & ONDEPLIST) { inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 0, &inodedep); TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state &= ~ONDEPLIST; if (TAILQ_EMPTY(&inodedep->id_freeblklst)) free_inodedep(inodedep); } /* * All of the freeblock deps must be complete prior to this call * so it's now safe to complete earlier outstanding journal entries. */ handle_jwork(&freeblks->fb_jwork); WORKITEM_FREE(freeblks, D_FREEBLKS); FREE_LOCK(ump); return (0); } /* * Release blocks associated with the freeblks and stored in the indirect * block dbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * This handles partial and complete truncation of blocks. Partial is noted * with goingaway == 0. In this case the freework is completed after the * zero'd indirects are written to disk. For full truncation the freework * is completed after the block is freed. */ static void indir_trunc(freework, dbn, lbn) struct freework *freework; ufs2_daddr_t dbn; ufs_lbn_t lbn; { struct freework *nfreework; struct workhead wkhd; struct freeblks *freeblks; struct buf *bp; struct fs *fs; struct indirdep *indirdep; struct mount *mp; struct ufsmount *ump; ufs1_daddr_t *bap1; ufs2_daddr_t nb, nnb, *bap2; ufs_lbn_t lbnadd, nlbn; u_long key; int nblocks, ufs1fmt, freedblocks; int goingaway, freedeps, needj, level, cnt, i, error; freeblks = freework->fw_freeblks; mp = freeblks->fb_list.wk_mp; ump = VFSTOUFS(mp); fs = ump->um_fs; /* * Get buffer of block pointers to be freed. There are three cases: * * 1) Partial truncate caches the indirdep pointer in the freework * which provides us a back copy to the save bp which holds the * pointers we want to clear. When this completes the zero * pointers are written to the real copy. * 2) The indirect is being completely truncated, cancel_indirdep() * eliminated the real copy and placed the indirdep on the saved * copy. The indirdep and buf are discarded when this completes. * 3) The indirect was not in memory, we read a copy off of the disk * using the devvp and drop and invalidate the buffer when we're * done. */ goingaway = 1; indirdep = NULL; if (freework->fw_indir != NULL) { goingaway = 0; indirdep = freework->fw_indir; bp = indirdep->ir_savebp; if (bp == NULL || bp->b_blkno != dbn) panic("indir_trunc: Bad saved buf %p blkno %jd", bp, (intmax_t)dbn); } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { /* * The lock prevents the buf dep list from changing and * indirects on devvp should only ever have one dependency. */ indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: Bad indirdep %p from buf %p", indirdep, bp); } else { error = ffs_breadz(ump, freeblks->fb_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error) return; } ACQUIRE_LOCK(ump); /* Protects against a race with complete_trunc_indir(). */ freework->fw_state &= ~INPROGRESS; /* * If we have an indirdep we need to enforce the truncation order * and discard it when it is complete. */ if (indirdep) { if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && !TAILQ_EMPTY(&indirdep->ir_trunc)) { /* * Add the complete truncate to the list on the * indirdep to enforce in-order processing. 
*/ if (freework->fw_indir == NULL) TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); FREE_LOCK(ump); return; } /* * If we're goingaway, free the indirdep. Otherwise it will * linger until the write completes. */ if (goingaway) { KASSERT(indirdep->ir_savebp == bp, ("indir_trunc: losing ir_savebp %p", indirdep->ir_savebp)); indirdep->ir_savebp = NULL; free_indirdep(indirdep); } } FREE_LOCK(ump); /* Initialize pointers depending on block size. */ if (ump->um_fstype == UFS1) { bap1 = (ufs1_daddr_t *)bp->b_data; nb = bap1[freework->fw_off]; ufs1fmt = 1; bap2 = NULL; } else { bap2 = (ufs2_daddr_t *)bp->b_data; nb = bap2[freework->fw_off]; ufs1fmt = 0; bap1 = NULL; } level = lbn_level(lbn); needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; lbnadd = lbn_offset(fs, level); nblocks = btodb(fs->fs_bsize); nfreework = freework; freedeps = 0; cnt = 0; /* * Reclaim blocks. Traverses into nested indirect levels and * arranges for the current level to be freed when subordinates * are free when journaling. */ key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum); for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb, fs->fs_bsize) != 0) nb = 0; if (i != NINDIR(fs) - 1) { if (ufs1fmt) nnb = bap1[i+1]; else nnb = bap2[i+1]; } else nnb = 0; if (nb == 0) continue; cnt++; if (level != 0) { nlbn = (lbn + 1) - (i * lbnadd); if (needj != 0) { nfreework = newfreework(ump, freeblks, freework, nlbn, nb, fs->fs_frag, 0, 0); freedeps++; } indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); } else { struct freedep *freedep; /* * Attempt to aggregate freedep dependencies for * all blocks being released to the same CG. */ LIST_INIT(&wkhd); if (needj != 0 && (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { freedep = newfreedep(freework); WORKLIST_INSERT_UNLOCKED(&wkhd, &freedep->fd_list); freedeps++; } CTR3(KTR_SUJ, "indir_trunc: ino %jd blkno %jd size %d", freeblks->fb_inum, nb, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key); } } ffs_blkrelease_finish(ump, key); if (goingaway) { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } freedblocks = 0; if (level == 0) freedblocks = (nblocks * cnt); if (needj == 0) freedblocks += nblocks; freeblks_free(ump, freeblks, freedblocks); /* * If we are journaling set up the ref counts and offset so this * indirect can be completed when its children are free. */ if (needj) { ACQUIRE_LOCK(ump); freework->fw_off = i; freework->fw_ref += freedeps; freework->fw_ref -= NINDIR(fs) + 1; if (level == 0) freeblks->fb_cgwait += freedeps; if (freework->fw_ref == 0) freework_freeblock(freework, SINGLETON_KEY); FREE_LOCK(ump); return; } /* * If we're not journaling we can free the indirect now. */ dbn = dbtofsb(fs, dbn); CTR3(KTR_SUJ, "indir_trunc 2: ino %jd blkno %jd size %d", freeblks->fb_inum, dbn, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY); /* Non SUJ softdep does single-threaded truncations. */ if (freework->fw_blkno == dbn) { freework->fw_state |= ALLCOMPLETE; ACQUIRE_LOCK(ump); handle_written_freework(freework); FREE_LOCK(ump); } return; } /* * Cancel an allocindir when it is removed via truncation. When bp is not * NULL the indirect never appeared on disk and is scheduled to be freed * independently of the indir so we can more easily track journal work. 
*/ static void cancel_allocindir(aip, bp, freeblks, trunc) struct allocindir *aip; struct buf *bp; struct freeblks *freeblks; int trunc; { struct indirdep *indirdep; struct freefrag *freefrag; struct newblk *newblk; newblk = (struct newblk *)aip; LIST_REMOVE(aip, ai_next); /* * We must eliminate the pointer in bp if it must be freed on its * own due to partial truncate or pending journal work. */ if (bp && (trunc || newblk->nb_jnewblk)) { /* * Clear the pointer and mark the aip to be freed * directly if it never existed on disk. */ aip->ai_state |= DELAYEDFREE; indirdep = aip->ai_indirdep; if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; else ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; } /* * When truncating the previous pointer will be freed via * savedbp. Eliminate the freefrag which would dup free. */ if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { newblk->nb_freefrag = NULL; if (freefrag->ff_jdep) cancel_jfreefrag( WK_JFREEFRAG(freefrag->ff_jdep)); jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); WORKITEM_FREE(freefrag, D_FREEFRAG); } /* * If the journal hasn't been written the jnewblk must be passed * to the call to ffs_blkfree that reclaims the space. We accomplish * this by leaving the journal dependency on the newblk to be freed * when a freework is created in handle_workitem_freeblocks(). */ cancel_newblk(newblk, NULL, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); } /* * Create the mkdir dependencies for . and .. in a new directory. Link them * in to a newdirblk so any subsequent additions are tracked properly. The * caller is responsible for adding the mkdir1 dependency to the journal * and updating id_mkdiradd. This function returns with the per-filesystem * lock held. */ static struct mkdir * setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) struct diradd *dap; ino_t newinum; ino_t dinum; struct buf *newdirbp; struct mkdir **mkdirp; { struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk; struct mkdir *mkdir1, *mkdir2; struct worklist *wk; struct jaddref *jaddref; struct ufsmount *ump; struct mount *mp; mp = dap->da_list.wk_mp; ump = VFSTOUFS(mp); newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); LIST_INIT(&newdirblk->db_mkdir); mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); mkdir1->md_state = ATTACHED | MKDIR_BODY; mkdir1->md_diradd = dap; mkdir1->md_jaddref = NULL; mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); mkdir2->md_state = ATTACHED | MKDIR_PARENT; mkdir2->md_diradd = dap; mkdir2->md_jaddref = NULL; if (MOUNTEDSUJ(mp) == 0) { mkdir1->md_state |= DEPCOMPLETE; mkdir2->md_state |= DEPCOMPLETE; } /* * Dependency on "." and ".." being written to disk. */ mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(VFSTOUFS(mp)); LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs); /* * We must link the pagedep, allocdirect, and newdirblk for * the initial file page so the pointer to the new directory * is not written until the directory contents are live and * any subsequent additions are not marked live until the * block is reachable via the inode. 
*/ if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) panic("setup_newdir: lost pagedep"); LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) if (wk->wk_type == D_ALLOCDIRECT) break; if (wk == NULL) panic("setup_newdir: lost allocdirect"); if (pagedep->pd_state & NEWBLOCK) panic("setup_newdir: NEWBLOCK already set"); newblk = WK_NEWBLK(wk); pagedep->pd_state |= NEWBLOCK; pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); /* * Look up the inodedep for the parent directory so that we * can link mkdir2 into the pending dotdot jaddref or * the inode write if there is none. If the inode is * ALLCOMPLETE and no jaddref is present all dependencies have * been satisfied and mkdir2 can be freed. */ inodedep_lookup(mp, dinum, 0, &inodedep); if (MOUNTEDSUJ(mp)) { if (inodedep == NULL) panic("setup_newdir: Lost parent."); jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && (jaddref->ja_state & MKDIR_PARENT), ("setup_newdir: bad dotdot jaddref %p", jaddref)); LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); mkdir2->md_jaddref = jaddref; jaddref->ja_mkdir = mkdir2; } else if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); mkdir2 = NULL; } else { LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); } *mkdirp = mkdir2; return (mkdir1); } /* * Directory entry addition dependencies. * * When adding a new directory entry, the inode (with its incremented link * count) must be written to disk before the directory entry's pointer to it. * Also, if the inode is newly allocated, the corresponding freemap must be * updated (on disk) before the directory entry's pointer. These requirements * are met via undo/redo on the directory entry's pointer, which consists * simply of the inode number. * * As directory entries are added and deleted, the free space within a * directory block can become fragmented. The ufs filesystem will compact * a fragmented directory block to make space for a new entry. When this * occurs, the offsets of previously added entries change. Any "diradd" * dependency structures corresponding to these entries must be updated with * the new offsets. */ /* * This routine is called after the in-memory inode's link * count has been incremented, but before the directory entry's * pointer to the inode has been set. 
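 *
 * A toy model of the undo/redo described above (userland sketch only;
 * the struct layout and helper names are invented for illustration):
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	struct toy_dirent {
 *		uint32_t d_ino;		// 0 == unallocated entry
 *	};
 *
 *	// Before the directory block is written: undo the new pointer
 *	// if its inode is not yet known to be stable on disk.
 *	static uint32_t
 *	toy_start_write(struct toy_dirent *de, int inode_stable)
 *	{
 *		uint32_t saved = de->d_ino;
 *
 *		if (!inode_stable)
 *			de->d_ino = 0;	// rolled back for this write
 *		return (saved);
 *	}
 *
 *	// After the write completes: redo the pointer in memory; it
 *	// reaches the disk in a later write, once the inode is stable.
 *	static void
 *	toy_complete_write(struct toy_dirent *de, uint32_t saved)
 *	{
 *		de->d_ino = saved;
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		struct toy_dirent de = { .d_ino = 1234 };
 *		uint32_t saved = toy_start_write(&de, 0);
 *
 *		printf("written to disk: %u\n", de.d_ino);	// 0
 *		toy_complete_write(&de, saved);
 *		printf("visible in core: %u\n", de.d_ino);	// 1234
 *		return (0);
 *	}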
*/ int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ ino_t newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ int isnewblk; /* entry is in a newly allocated block */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk; struct mkdir *mkdir1, *mkdir2; struct jaddref *jaddref; struct ufsmount *ump; struct mount *mp; int isindir; mp = ITOVFS(dp); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_add called on non-softdep filesystem")); /* * Whiteouts have no dependencies. */ if (newinum == UFS_WINO) { if (newdirbp != NULL) bdwrite(newdirbp); return (0); } jaddref = NULL; mkdir1 = mkdir2 = NULL; fs = ump->um_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); dap = malloc(sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; LIST_INIT(&dap->da_jwork); isindir = bp->b_lblkno >= UFS_NDADDR; newdirblk = NULL; if (isnewblk && (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); LIST_INIT(&newdirblk->db_mkdir); } /* * If we're creating a new directory, set up the dependencies and set * the dap state to wait for them. Otherwise it's COMPLETE and * we can move on. */ if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(ump); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, &mkdir2); } /* * Link into parent directory pagedep to await its being written. */ pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); #ifdef INVARIANTS if (diradd_lookup(pagedep, offset) != NULL) panic("softdep_setup_directory_add: %p already at off %d\n", diradd_lookup(pagedep, offset), offset); #endif dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); /* * If we're journaling, link the diradd into the jaddref so it * may be completed after the journal entry is written. Otherwise, * link the diradd into its inodedep. If the inode is not yet * written place it on the bufwait list, otherwise do the post-inode * write processing to put it on the id_pendinghd list. */ if (MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_add: bad jaddref %p", jaddref)); jaddref->ja_diroff = diroffset; jaddref->ja_diradd = dap; add_to_journal(&jaddref->ja_list); } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); /* * Add the journal entries for . and .. links now that the primary * link is written.
*/ if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, inoreflst, if_deps); KASSERT(jaddref != NULL && jaddref->ja_ino == jaddref->ja_parent && (jaddref->ja_state & MKDIR_BODY), ("softdep_setup_directory_add: bad dot jaddref %p", jaddref)); mkdir1->md_jaddref = jaddref; jaddref->ja_mkdir = mkdir1; /* * It is important that the dotdot journal entry * is added prior to the dot entry since dot writes * both the dot and dotdot links. These both must * be added after the primary link for the journal * to remain consistent. */ add_to_journal(&mkdir2->md_jaddref->ja_list); add_to_journal(&jaddref->ja_list); } /* * If we are adding a new directory remember this diradd so that if * we rename it we can keep the dot and dotdot dependencies. If * we are adding a new name for an inode that has a mkdiradd we * must be in rename and we have to move the dot and dotdot * dependencies to this new name. The old name is being orphaned * soon. */ if (mkdir1 != NULL) { if (inodedep->id_mkdiradd != NULL) panic("softdep_setup_directory_add: Existing mkdir"); inodedep->id_mkdiradd = dap; } else if (inodedep->id_mkdiradd) merge_diradd(inodedep, dap); if (newdirblk != NULL) { /* * There is nothing to do if we are already tracking * this block. */ if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(ump); return (0); } if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) == 0) panic("softdep_setup_directory_add: lost entry"); WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); pagedep->pd_state |= NEWBLOCK; pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; FREE_LOCK(ump); /* * If we extended into an indirect block, signal direnter to sync. */ if (isindir) return (1); return (0); } FREE_LOCK(ump); return (0); } /* * This procedure is called to change the offset of a directory * entry when compacting a directory block which must be owned * exclusively by the caller. Note that the actual entry movement * must be done in this procedure to ensure that no I/O completions * occur while the move is in progress. */ void softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) struct buf *bp; /* Buffer holding directory block. */ struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ caddr_t newloc; /* address of new directory location */ int entrysize; /* size of directory entry */ { int offset, oldoffset, newoffset; struct pagedep *pagedep; struct jmvref *jmvref; struct diradd *dap; struct direct *de; struct mount *mp; struct ufsmount *ump; ufs_lbn_t lbn; int flags; mp = ITOVFS(dp); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_change_directoryentry_offset called on " "non-softdep filesystem")); de = (struct direct *)oldloc; jmvref = NULL; flags = 0; /* * Moves are always journaled as it would be too complex to * determine if any affected adds or removes are present in the * journal.
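 *
 * A standalone sketch of the move performed below: the entry bytes are
 * slid within the block and any dependency keyed on the old offset is
 * retargeted (demo_* names are hypothetical; real UFS entries are
 * variable length):
 */
#include <stdio.h>
#include <string.h>

struct demo_entry { unsigned ino; char name[12]; };

int
main(void)
{
	char block[64] = { 0 };
	struct demo_entry e = { 77, "file" };
	int oldoff = 32, newoff = 16, depoff = 32;

	memcpy(block + oldoff, &e, sizeof(e));
	/* Compact: slide the entry down within the block ... */
	memmove(block + newoff, block + oldoff, sizeof(e));
	/* ... and retarget the tracked offset (cf. dap->da_offset). */
	if (depoff == oldoff)
		depoff = newoff;
	printf("dependency now tracks offset %d\n", depoff);
	return (0);
}
/*
 * (end of illustrative sketch)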
*/ if (MOUNTEDSUJ(mp)) { flags = DEPALLOC; jmvref = newjmvref(dp, de->d_ino, dp->i_offset + (oldloc - base), dp->i_offset + (newloc - base)); } lbn = lblkno(ump->um_fs, dp->i_offset); offset = blkoff(ump->um_fs, dp->i_offset); oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); ACQUIRE_LOCK(ump); if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) goto done; dap = diradd_lookup(pagedep, oldoffset); if (dap) { dap->da_offset = newoffset; newoffset = DIRADDHASH(newoffset); oldoffset = DIRADDHASH(oldoffset); if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && newoffset != oldoffset) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], dap, da_pdlist); } } done: if (jmvref) { jmvref->jm_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); add_to_journal(&jmvref->jm_list); } bcopy(oldloc, newloc, entrysize); FREE_LOCK(ump); } /* * Move the mkdir dependencies and journal work from one diradd to another * when renaming a directory. The new name must depend on the mkdir deps * completing as the old name did. Directories can only have one valid link * at a time so one must be canonical. */ static void merge_diradd(inodedep, newdap) struct inodedep *inodedep; struct diradd *newdap; { struct diradd *olddap; struct mkdir *mkdir, *nextmd; struct ufsmount *ump; short state; olddap = inodedep->id_mkdiradd; inodedep->id_mkdiradd = newdap; if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { newdap->da_state &= ~DEPCOMPLETE; ump = VFSTOUFS(inodedep->id_list.wk_mp); for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != olddap) continue; mkdir->md_diradd = newdap; state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); newdap->da_state |= state; olddap->da_state &= ~state; if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) break; } if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("merge_diradd: unfound ref"); } /* * Any mkdir related journal items are not safe to be freed until * the new name is stable. */ jwork_move(&newdap->da_jwork, &olddap->da_jwork); olddap->da_state |= DEPCOMPLETE; complete_diradd(olddap); } /* * Move the diradd to the pending list when all diradd dependencies are * complete. */ static void complete_diradd(dap) struct diradd *dap; { struct pagedep *pagedep; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } /* * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal * add entries and conditionally journal the remove. */ static void cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) struct diradd *dap; struct dirrem *dirrem; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; { struct inodedep *inodedep; struct jaddref *jaddref; struct inoref *inoref; struct ufsmount *ump; struct mkdir *mkdir; /* * If no remove references were allocated we're on a non-journaled * filesystem and can skip the cancel step. */ if (jremref == NULL) { free_diradd(dap, NULL); return; } /* * Cancel the primary name and free it if it does not require * journaling. */ if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) { /* Abort the addref that references this diradd.
*/ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if (inoref->if_list.wk_type != D_JADDREF) continue; jaddref = (struct jaddref *)inoref; if (jaddref->ja_diradd != dap) continue; if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork) == 0) { free_jremref(jremref); jremref = NULL; } break; } } /* * Cancel subordinate names and free them if they do not require * journaling. */ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { ump = VFSTOUFS(dap->da_list.wk_mp); LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) { if (mkdir->md_diradd != dap) continue; if ((jaddref = mkdir->md_jaddref) == NULL) continue; mkdir->md_jaddref = NULL; if (mkdir->md_state & MKDIR_PARENT) { if (cancel_jaddref(jaddref, NULL, &dirrem->dm_jwork) == 0) { free_jremref(dotdotremref); dotdotremref = NULL; } } else { if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork) == 0) { free_jremref(dotremref); dotremref = NULL; } } } } if (jremref) journal_jremref(dirrem, jremref, inodedep); if (dotremref) journal_jremref(dirrem, dotremref, inodedep); if (dotdotremref) journal_jremref(dirrem, dotdotremref, NULL); jwork_move(&dirrem->dm_jwork, &dap->da_jwork); free_diradd(dap, &dirrem->dm_jwork); } /* * Free a diradd dependency structure. */ static void free_diradd(dap, wkhd) struct diradd *dap; struct workhead *wkhd; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; struct ufsmount *ump; ump = VFSTOUFS(dap->da_list.wk_mp); LOCK_OWNED(ump); LIST_REMOVE(dap, da_pdlist); if (dap->da_state & ONWORKLIST) WORKLIST_REMOVE(&dap->da_list); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; dirrem->dm_state |= COMPLETE; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) if (inodedep->id_mkdiradd == dap) inodedep->id_mkdiradd = NULL; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); LIST_REMOVE(mkdir, md_mkdirs); if (mkdir->md_state & ONWORKLIST) WORKLIST_REMOVE(&mkdir->md_list); if (mkdir->md_jaddref != NULL) panic("free_diradd: Unexpected jaddref"); WORKITEM_FREE(mkdir, D_MKDIR); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) break; } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } if (inodedep) free_inodedep(inodedep); /* * Free any journal segments waiting for the directory write. */ handle_jwork(&dap->da_jwork); WORKITEM_FREE(dap, D_DIRADD); } /* * Directory entry removal dependencies. * * When removing a directory entry, the entry's inode pointer must be * zero'ed on disk before the corresponding inode's link count is decremented * (possibly freeing the inode for re-use). This dependency is handled by * updating the directory entry but delaying the inode count reduction until * after the directory block has been written to disk. After this point, the * inode count can be decremented whenever it is convenient. */ /* * This routine should be called immediately after removing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will do this task when it is safe. 
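 *
 * A minimal model of that contract (demo_* names are hypothetical): the
 * effective link count drops as soon as the entry is removed, but the
 * count destined for disk is only brought down once the zeroed
 * directory block is known to be stable.
 */
#include <stdio.h>

struct demo_inode { int i_nlink; int i_effnlink; };

static void
demo_remove_entry(struct demo_inode *ip)
{
	ip->i_effnlink--;		/* visible effect is immediate */
}

static void
demo_dirblock_written(struct demo_inode *ip)
{
	ip->i_nlink = ip->i_effnlink;	/* now safe to commit */
}

int
main(void)
{
	struct demo_inode ino = { 2, 2 };

	demo_remove_entry(&ino);
	printf("before write: nlink=%d effnlink=%d\n",
	    ino.i_nlink, ino.i_effnlink);
	demo_dirblock_written(&ino);
	printf("after write:  nlink=%d effnlink=%d\n",
	    ino.i_nlink, ino.i_effnlink);
	return (0);
}
/*
 * (end of illustrative sketch)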
*/ void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; struct inodedep *inodedep; struct ufsmount *ump; int direct; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_remove called on non-softdep filesystem")); /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want * newdirrem() to set up the full directory remove which requires * isrmdir > 1. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); /* * Add the dirrem to the inodedep's pending remove list for quick * discovery later. */ if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_remove: Lost inodedep."); KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); dirrem->dm_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to a zeroed entry until * the new inode is committed to disk. If the COMPLETE flag is * set then we have deleted an entry that never made it to * disk. If the entry we deleted resulted from a name change, * then the old name still resides on disk. We cannot delete * its inode (returned to us in prevdirrem) until the zeroed * directory entry gets to disk. The new inode has never been * referenced on the disk, so can be deleted immediately. */ if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); FREE_LOCK(ump); } else { if (prevdirrem != NULL) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; direct = LIST_EMPTY(&dirrem->dm_jremrefhd); FREE_LOCK(ump); if (direct) handle_workitem_remove(dirrem, 0); } } /* * Check for an entry matching 'offset' on both the pd_diraddhd list and the * pd_pendinghd list of a pagedep. */ static struct diradd * diradd_lookup(pagedep, offset) struct pagedep *pagedep; int offset; { struct diradd *dap; LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) if (dap->da_offset == offset) return (dap); LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset == offset) return (dap); return (NULL); } /* * Search for a .. diradd dependency in a directory that is being removed. * If the directory was renamed to a new parent we have a diradd rather * than a mkdir for the .. entry. We need to cancel it now before * it is found in truncate(). */ static struct jremref * cancel_diradd_dotdot(ip, dirrem, jremref) struct inode *ip; struct dirrem *dirrem; struct jremref *jremref; { struct pagedep *pagedep; struct diradd *dap; struct worklist *wk; if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0) return (jremref); dap = diradd_lookup(pagedep, DOTDOT_OFFSET); if (dap == NULL) return (jremref); cancel_diradd(dap, dirrem, jremref, NULL, NULL); /* * Mark any journal work as belonging to the parent so it is freed * with the .. reference. */ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) wk->wk_state |= MKDIR_PARENT; return (NULL); } /* * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to * replace it with a dirrem/diradd pair as a result of re-parenting a * directory.
This ensures that we don't simultaneously have a mkdir and * a diradd for the same .. entry. */ static struct jremref * cancel_mkdir_dotdot(ip, dirrem, jremref) struct inode *ip; struct dirrem *dirrem; struct jremref *jremref; { struct inodedep *inodedep; struct jaddref *jaddref; struct ufsmount *ump; struct mkdir *mkdir; struct diradd *dap; struct mount *mp; mp = ITOVFS(ip); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) return (jremref); dap = inodedep->id_mkdiradd; if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) return (jremref); ump = VFSTOUFS(inodedep->id_list.wk_mp); for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = LIST_NEXT(mkdir, md_mkdirs)) if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) break; if (mkdir == NULL) panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); if ((jaddref = mkdir->md_jaddref) != NULL) { mkdir->md_jaddref = NULL; jaddref->ja_state &= ~MKDIR_PARENT; if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_mkdir_dotdot: Lost parent inodedep"); if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { journal_jremref(dirrem, jremref, inodedep); jremref = NULL; } } if (mkdir->md_state & ONWORKLIST) WORKLIST_REMOVE(&mkdir->md_list); mkdir->md_state |= ALLCOMPLETE; complete_mkdir(mkdir); return (jremref); } static void journal_jremref(dirrem, jremref, inodedep) struct dirrem *dirrem; struct jremref *jremref; struct inodedep *inodedep; { if (inodedep == NULL) if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("journal_jremref: Lost inodedep"); LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); add_to_journal(&jremref->jr_list); } static void dirrem_journal(dirrem, jremref, dotremref, dotdotremref) struct dirrem *dirrem; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; { struct inodedep *inodedep; if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("dirrem_journal: Lost inodedep"); journal_jremref(dirrem, jremref, inodedep); if (dotremref) journal_jremref(dirrem, dotremref, inodedep); if (dotdotremref) journal_jremref(dirrem, dotdotremref, NULL); } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ static struct dirrem * newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ struct dirrem **prevdirremp; /* previously referenced inode, if any */ { int offset; ufs_lbn_t lbn; struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; struct vnode *dvp; struct ufsmount *ump; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); dvp = ITOV(dp); ump = ITOUMP(dp); /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and * we are not a snapshot, request some inodedep cleanup. * Limiting the number of dirrem structures will also limit * the number of freefile and freeblks structures. 
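 *
 * A toy version of that policy check (demo_* names are hypothetical;
 * softdep_excess_items() below implements the real accounting): cleanup
 * is requested only when the global count is over its cap and this
 * filesystem holds more than a proportional share.
 */
#include <stdio.h>

static int
demo_excess_items(int global, int global_max, int mine, int nmounts)
{
	return (global > global_max && mine > global / nmounts);
}

int
main(void)
{
	/* 1000 items against a cap of 800, and 600 of them are ours. */
	printf("cleanup needed: %d\n", demo_excess_items(1000, 800, 600, 4));
	return (0);
}
/*
 * (end of illustrative sketch)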
*/ ACQUIRE_LOCK(ump); if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM)) schedule_cleanup(UFSTOVFS(ump)); else FREE_LOCK(ump); dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); LIST_INIT(&dirrem->dm_jremrefhd); LIST_INIT(&dirrem->dm_jwork); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; /* * Allocate remove reference structures to track journal write * dependencies. We will always have one for the link and * when doing directories we will always have one more for dot. * When renaming a directory we skip the dotdot link change so * this is not needed. */ jremref = dotremref = dotdotremref = NULL; if (DOINGSUJ(dvp)) { if (isrmdir) { jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 2); dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, ip->i_effnlink + 1); dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, dp->i_effnlink + 1); dotdotremref->jr_state |= MKDIR_PARENT; } else jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 1); } ACQUIRE_LOCK(ump); lbn = lblkno(ump->um_fs, dp->i_offset); offset = blkoff(ump->um_fs, dp->i_offset); pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC, &pagedep); dirrem->dm_pagedep = pagedep; dirrem->dm_offset = offset; /* * If we're renaming a .. link to a new directory, cancel any * existing MKDIR_PARENT mkdir. If it has already been canceled * the jremref is preserved for any potential diradd in this * location. This can not coincide with a rmdir. */ if (dp->i_offset == DOTDOT_OFFSET) { if (isrmdir) panic("newdirrem: .. directory change during remove?"); jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); } /* * If we're removing a directory search for the .. dependency now and * cancel it. Any pending journal work will be added to the dirrem * to be completed when the workitem remove completes. */ if (isrmdir) dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can * be de-allocated. */ dap = diradd_lookup(pagedep, offset); if (dap == NULL) { /* * Link the jremref structures into the dirrem so they are * written prior to the pagedep. */ if (jremref) dirrem_journal(dirrem, jremref, dotremref, dotdotremref); return (dirrem); } /* * Must be ATTACHED at this point. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %ju should be %ju", (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum); /* * If we are deleting a changed name that never made it to disk, * then return the dirrem describing the previous inode (which * represents the inode currently referenced from this entry on disk). */ if ((dap->da_state & DIRCHG) != 0) { *prevdirremp = dap->da_previous; dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } /* * We are deleting an entry that never made it to disk. * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); #ifdef INVARIANTS if (isrmdir == 0) { struct worklist *wk; LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) panic("bad wk %p (0x%X)\n", wk, wk->wk_state); } #endif return (dirrem); } /* * Directory entry change dependencies. 
* * Changing an existing directory entry requires that an add operation * be completed first followed by a deletion. The semantics for the addition * are identical to the description of adding a new entry above except * that the rollback is to the old inode number rather than zero. Once * the addition dependency is completed, the removal is done as described * in the removal routine above. */ /* * This routine should be called immediately after changing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will perform this task when it is safe. */ void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ ino_t newinum; /* new inode number for changed entry */ int isrmdir; /* indicates if doing RMDIR */ { int offset; struct diradd *dap = NULL; struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct jaddref *jaddref; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(dp); ump = VFSTOUFS(mp); offset = blkoff(ump->um_fs, dp->i_offset); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_change called on non-softdep filesystem")); /* * Whiteouts do not need diradd dependencies. */ if (newinum != UFS_WINO) { dap = malloc(sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; LIST_INIT(&dap->da_jwork); } /* * Allocate a new dirrem and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: * 0 - non-directory file rename * 1 - directory rename within same directory * inum - directory rename to new directory of given inode number * When renaming to a new directory, we are both deleting and * creating a new directory entry, so the link count on the new * directory should not change. Thus we do not need the followup * dirrem which is usually done in handle_workitem_remove. We set * the DIRCHG flag to tell handle_workitem_remove to skip the * followup dirrem. */ if (isrmdir > 1) dirrem->dm_state |= DIRCHG; /* * Whiteouts have no additional dependencies, * so just put the dirrem on the correct list. */ if (newinum == UFS_WINO) { if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } FREE_LOCK(ump); return; } /* * Add the dirrem to the inodedep's pending remove list for quick * discovery later. A valid nlinkdelta ensures that this lookup * will not fail. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_directory_change: Lost inodedep."); dirrem->dm_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to the previous inode until * the new inode is committed to disk. If the COMPLETE flag is * set, then we have deleted an entry that never made it to disk. * If the entry we deleted resulted from a name change, then the old * inode reference still resides on disk. Any rollback that we do * needs to be to that old inode (returned to us in prevdirrem). 
If * the entry we deleted resulted from a create, then there is * no entry on the disk, so we want to roll back to zero rather * than the uncommitted inode. In either of the COMPLETE cases we * want to immediately free the unwritten and unreferenced inode. */ if ((dirrem->dm_state & COMPLETE) == 0) { dap->da_previous = dirrem; } else { if (prevdirrem != NULL) { dap->da_previous = prevdirrem; } else { dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } /* * Look up the jaddref for this journal entry. We must finish * initializing it and make the diradd write dependent on it. * If we're not journaling, put it on the id_bufwait list if the * inode is not yet written. If it is written, do the post-inode * write processing to put it on the id_pendinghd list. */ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); if (MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_change: bad jaddref %p", jaddref)); jaddref->ja_diroff = dp->i_offset; jaddref->ja_diradd = dap; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); add_to_journal(&jaddref->ja_list); } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } else { LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } /* * If we're making a new name for a directory that has not been * committed we need to move the dot and dotdot references to * this new name. */ if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) merge_diradd(inodedep, dap); FREE_LOCK(ump); } /* * Called whenever the link count on an inode is changed. * It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. */ void softdep_change_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_change_linkcnt called on non-softdep filesystem")); ACQUIRE_LOCK(ump); inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(ump); } /* * Attach a sbdep dependency to the superblock buf so that we can keep * track of the head of the linked list of referenced but unlinked inodes. */ void softdep_setup_sbupdate(ump, fs, bp) struct ufsmount *ump; struct fs *fs; struct buf *bp; { struct sbdep *sbdep; struct worklist *wk; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_sbupdate called on non-softdep filesystem")); LIST_FOREACH(wk, &bp->b_dep, wk_list) if (wk->wk_type == D_SBDEP) break; if (wk != NULL) return; sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); sbdep->sb_fs = fs; sbdep->sb_ump = ump; ACQUIRE_LOCK(ump); WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); FREE_LOCK(ump); } /* * Return the first unlinked inodedep which is ready to be the head of the * list. The inodedep and all those after it must have valid next pointers.
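 *
 * A self-contained sketch of that backward scan (demo_* names are
 * hypothetical; the UNLINKPREV refinement of the real code is omitted):
 * walk from the tail toward the head and stop at the earliest element
 * such that it and all elements after it have valid next pointers.
 */
#include <sys/queue.h>
#include <stdio.h>

#define	DEMO_UNLINKNEXT	0x1	/* on-disk next pointer is valid */

struct demo_idep {
	TAILQ_ENTRY(demo_idep)	id_unlinked;
	int			id_state;
	int			id_ino;
};
TAILQ_HEAD(demo_ulst, demo_idep);

static struct demo_idep *
demo_first_unlinked(struct demo_ulst *lst)
{
	struct demo_idep *idep, *idp;

	for (idep = TAILQ_LAST(lst, demo_ulst); idep; idep = idp) {
		if ((idep->id_state & DEMO_UNLINKNEXT) == 0)
			return (NULL);
		idp = TAILQ_PREV(idep, demo_ulst, id_unlinked);
		if (idp == NULL || (idp->id_state & DEMO_UNLINKNEXT) == 0)
			break;
	}
	return (idep);
}

int
main(void)
{
	struct demo_ulst lst = TAILQ_HEAD_INITIALIZER(lst);
	struct demo_idep a = { .id_state = 0, .id_ino = 10 };
	struct demo_idep b = { .id_state = DEMO_UNLINKNEXT, .id_ino = 11 };
	struct demo_idep *head;

	TAILQ_INSERT_TAIL(&lst, &a, id_unlinked);
	TAILQ_INSERT_TAIL(&lst, &b, id_unlinked);
	head = demo_first_unlinked(&lst);
	printf("head candidate: ino %d\n", head != NULL ? head->id_ino : -1);
	return (0);
}
/*
 * (end of illustrative sketch)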
*/ static struct inodedep * first_unlinked_inodedep(ump) struct ufsmount *ump; { struct inodedep *inodedep; struct inodedep *idp; LOCK_OWNED(ump); for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); inodedep; inodedep = idp) { if ((inodedep->id_state & UNLINKNEXT) == 0) return (NULL); idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) break; if ((inodedep->id_state & UNLINKPREV) == 0) break; } return (inodedep); } /* * Set the sujfree unlinked head pointer prior to writing a superblock. */ static void initiate_write_sbdep(sbdep) struct sbdep *sbdep; { struct inodedep *inodedep; struct fs *bpfs; struct fs *fs; bpfs = sbdep->sb_fs; fs = sbdep->sb_ump->um_fs; inodedep = first_unlinked_inodedep(sbdep->sb_ump); if (inodedep) { fs->fs_sujfree = inodedep->id_ino; inodedep->id_state |= UNLINKPREV; } else fs->fs_sujfree = 0; bpfs->fs_sujfree = fs->fs_sujfree; /* * Because we have made changes to the superblock, we need to * recompute its check-hash. */ bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); } /* * After a superblock is written determine whether it must be written again * due to a changing unlinked list head. */ static int handle_written_sbdep(sbdep, bp) struct sbdep *sbdep; struct buf *bp; { struct inodedep *inodedep; struct fs *fs; LOCK_OWNED(sbdep->sb_ump); fs = sbdep->sb_fs; /* * If the superblock doesn't match the in-memory list start over. */ inodedep = first_unlinked_inodedep(sbdep->sb_ump); if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || (inodedep == NULL && fs->fs_sujfree != 0)) { bdirty(bp); return (1); } WORKITEM_FREE(sbdep, D_SBDEP); if (fs->fs_sujfree == 0) return (0); /* * Now that we have a record of this inode in stable store allow it * to be written to free up pending work. Inodes may see a lot of * write activity after they are unlinked which we must not hold up. */ for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) panic("handle_written_sbdep: Bad inodedep %p (0x%X)", inodedep, inodedep->id_state); if (inodedep->id_state & UNLINKONLIST) break; inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; } return (0); } /* * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. */ static void unlinked_inodedep(mp, inodedep) struct mount *mp; struct inodedep *inodedep; { struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if (MOUNTEDSUJ(mp) == 0) return; ump->um_fs->fs_fmod = 1; if (inodedep->id_state & UNLINKED) panic("unlinked_inodedep: %p already unlinked\n", inodedep); inodedep->id_state |= UNLINKED; TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); } /* * Remove an inodedep from the unlinked inodedep list. This may require * disk writes if the inode has made it that far. */ static void clear_unlinked_inodedep(inodedep) struct inodedep *inodedep; { struct ufs2_dinode *dip; struct ufsmount *ump; struct inodedep *idp; struct inodedep *idn; struct fs *fs, *bpfs; struct buf *bp; daddr_t dbn; ino_t ino; ino_t nino; ino_t pino; int error; ump = VFSTOUFS(inodedep->id_list.wk_mp); fs = ump->um_fs; ino = inodedep->id_ino; error = 0; for (;;) { LOCK_OWNED(ump); KASSERT((inodedep->id_state & UNLINKED) != 0, ("clear_unlinked_inodedep: inodedep %p not unlinked", inodedep)); /* * If nothing has yet been written simply remove us from * the in memory list and return. This is the most common * case where handle_workitem_remove() loses the final * reference. 
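 *
 * The on-disk representation being edited here is a singly linked chain:
 * the superblock's fs_sujfree names the first unlinked inode and each
 * dinode's di_freelink names the next. A miniature model of unlinking
 * an interior inode (demo_* names are hypothetical):
 */
#include <stdio.h>

struct demo_dinode { unsigned di_freelink; };

int
main(void)
{
	struct demo_dinode dinode[5] = {{ 0 }};
	unsigned sujfree;

	sujfree = 2;			/* head of the chain: inode 2 */
	dinode[2].di_freelink = 3;	/* 2 -> 3 -> 4 -> end */
	dinode[3].di_freelink = 4;
	dinode[4].di_freelink = 0;

	/*
	 * Unlink inode 3: the predecessor's next pointer must reach
	 * stable storage before inode 3 is recycled, so a crash can
	 * never leave the chain pointing at a reused inode.
	 */
	dinode[2].di_freelink = dinode[3].di_freelink;
	dinode[3].di_freelink = 0;

	printf("chain: %u -> %u -> end\n", sujfree,
	    dinode[sujfree].di_freelink);
	return (0);
}
/*
 * (end of illustrative sketch)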
*/ if ((inodedep->id_state & UNLINKLINKS) == 0) break; /* * If we have a NEXT pointer and no PREV pointer we can simply * clear NEXT's PREV and remove ourselves from the list. Be * careful not to clear PREV if the superblock points at * next as well. */ idn = TAILQ_NEXT(inodedep, id_unlinked); if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { if (idn && fs->fs_sujfree != idn->id_ino) idn->id_state &= ~UNLINKPREV; break; } /* * Here we have an inodedep which is actually linked into * the list. We must remove it by forcing a write to the * link before us, whether it be the superblock or an inode. * Unfortunately the list may change while we're waiting * on the buf lock for either resource so we must loop until * we lock the right one. If both the superblock and an * inode point to this inode we must clear the inode first * followed by the superblock. */ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); pino = 0; if (idp && (idp->id_state & UNLINKNEXT)) pino = idp->id_ino; FREE_LOCK(ump); if (pino == 0) { bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); } else { dbn = fsbtodb(fs, ino_to_fsba(fs, pino)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); } ACQUIRE_LOCK(ump); if (error) break; /* If the list has changed restart the loop. */ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); nino = 0; if (idp && (idp->id_state & UNLINKNEXT)) nino = idp->id_ino; if (nino != pino || (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); continue; } nino = 0; idn = TAILQ_NEXT(inodedep, id_unlinked); if (idn) nino = idn->id_ino; /* * Remove us from the in memory list. After this we cannot * access the inodedep. */ KASSERT((inodedep->id_state & UNLINKED) != 0, ("clear_unlinked_inodedep: inodedep %p not unlinked", inodedep)); inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); FREE_LOCK(ump); /* * The predecessor's next pointer is manually updated here * so that the NEXT flag is never cleared for an element * that is in the list. */ if (pino == 0) { bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); bpfs = (struct fs *)bp->b_data; ffs_oldfscompat_write(bpfs, ump); softdep_setup_sbupdate(ump, bpfs, bp); /* * Because we may have made changes to the superblock, * we need to recompute its check-hash. */ bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); } else if (fs->fs_magic == FS_UFS1_MAGIC) { ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, pino))->di_freelink = nino; } else { dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, pino); dip->di_freelink = nino; ffs_update_dinode_ckhash(fs, dip); } /* * If the bwrite fails we have no recourse to recover. The * filesystem is corrupted already. */ bwrite(bp); ACQUIRE_LOCK(ump); /* * If the superblock pointer still needs to be cleared force * a write here. */ if (fs->fs_sujfree == ino) { FREE_LOCK(ump); bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); bpfs = (struct fs *)bp->b_data; ffs_oldfscompat_write(bpfs, ump); softdep_setup_sbupdate(ump, bpfs, bp); /* * Because we may have made changes to the superblock, * we need to recompute its check-hash. 
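 *
 * The pattern is: mutate, then recompute the self-excluded check-hash
 * before handing the buffer to the disk. A tiny standalone analogue
 * (demo_* names and the FNV-1a hash are stand-ins for the real
 * ffs_calc_sbhash()):
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_sb {
	uint32_t	payload[4];
	uint32_t	ckhash;		/* excluded from its own hash */
};

static uint32_t
demo_calc_hash(const struct demo_sb *sb)
{
	const uint8_t *p = (const uint8_t *)sb;
	uint32_t h = 2166136261u;
	size_t i;

	for (i = 0; i < offsetof(struct demo_sb, ckhash); i++)
		h = (h ^ p[i]) * 16777619u;
	return (h);
}

int
main(void)
{
	struct demo_sb sb = {{ 1, 2, 3, 4 }, 0 };

	sb.ckhash = demo_calc_hash(&sb);
	sb.payload[0] = 99;			/* change the contents ... */
	sb.ckhash = demo_calc_hash(&sb);	/* ... so redo the hash */
	printf("ckhash=%08x\n", (unsigned)sb.ckhash);
	return (0);
}
/*
 * (end of illustrative sketch)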
*/ bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); bwrite(bp); ACQUIRE_LOCK(ump); } if (fs->fs_sujfree != ino) return; panic("clear_unlinked_inodedep: Failed to clear free head"); } if (inodedep->id_ino == fs->fs_sujfree) panic("clear_unlinked_inodedep: Freeing head of free list"); inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); return; } /* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ static int handle_workitem_remove(dirrem, flags) struct dirrem *dirrem; int flags; { struct inodedep *inodedep; struct workhead dotdotwk; struct worklist *wk; struct ufsmount *ump; struct mount *mp; struct vnode *vp; struct inode *ip; ino_t oldinum; if (dirrem->dm_state & ONWORKLIST) panic("handle_workitem_remove: dirrem %p still on worklist", dirrem); oldinum = dirrem->dm_oldinum; mp = dirrem->dm_list.wk_mp; ump = VFSTOUFS(mp); flags |= LK_EXCLUSIVE; if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) return (EBUSY); ip = VTOI(vp); MPASS(ip->i_mode != 0); ACQUIRE_LOCK(ump); if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); if (dirrem->dm_state & ONDEPLIST) LIST_REMOVE(dirrem, dm_inonext); KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), ("handle_workitem_remove: Journal entries not written.")); /* * Move all dependencies waiting on the remove to complete * from the dirrem to the inode inowait list to be completed * after the inode has been updated and written to disk. * * Any marked MKDIR_PARENT are saved to be completed when the * dotdot ref is removed unless DIRCHG is specified. For * directory change operations there will be no further * directory writes and the jsegdeps need to be moved along * with the rest to be completed when the inode is free or * stable in the inode free list. */ LIST_INIT(&dotdotwk); while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { WORKLIST_REMOVE(wk); if ((dirrem->dm_state & DIRCHG) == 0 && wk->wk_state & MKDIR_PARENT) { wk->wk_state &= ~MKDIR_PARENT; WORKLIST_INSERT(&dotdotwk, wk); continue; } WORKLIST_INSERT(&inodedep->id_inowait, wk); } LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: file ino " "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink)); DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); if (ip->i_nlink == 0) unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT(LIST_EMPTY(&dirrem->dm_jwork), ("handle_workitem_remove: worklist not empty. %s", TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(ump); goto out; } /* * Directory deletion. Decrement reference count for both the * just deleted parent directory entry and the reference for ".". * Arrange to have the reference count on the parent decremented * to account for the loss of "..". 
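 *
 * The arithmetic below as one worked example: a directory holds one link
 * from its parent's entry and one from its own "." entry, and its parent
 * holds one extra link for "..", so removing the directory costs it two
 * links and costs the parent one.
 */
#include <stdio.h>

int
main(void)
{
	int dir_nlink = 2;	/* parent's entry + "." */
	int parent_nlink = 3;	/* its own links + our ".." */

	dir_nlink -= 2;		/* lose the entry and "." */
	parent_nlink -= 1;	/* lose ".." */
	printf("dir=%d parent=%d\n", dir_nlink, parent_nlink);
	return (0);
}
/*
 * (end of illustrative sketch)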
*/ ip->i_nlink -= 2; KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: directory ino " "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink)); DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); if (ip->i_nlink == 0) unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; /* * Rename a directory to a new parent. Since, we are both deleting * and creating a new directory entry, the link count on the new * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { KASSERT(LIST_EMPTY(&dirrem->dm_jwork), ("handle_workitem_remove: DIRCHG and worklist not empty.")); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(ump); goto out; } dirrem->dm_state = ONDEPLIST; dirrem->dm_oldinum = dirrem->dm_dirinum; /* * Place the dirrem on the parent's diremhd list. */ if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) panic("handle_workitem_remove: lost dir inodedep"); LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the allocated inode has never been written to disk, then * the on-disk inode is zero'ed and we can remove the file * immediately. When journaling if the inode has been marked * unlinked and not DEPCOMPLETE we know it can never be written. */ inodedep_lookup(mp, oldinum, 0, &inodedep); if (inodedep == NULL || (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || check_inode_unwritten(inodedep)) { FREE_LOCK(ump); vput(vp); return handle_workitem_remove(dirrem, flags); } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(ump); UFS_INODE_SET_FLAG(ip, IN_CHANGE); out: ffs_update(vp, 0); vput(vp); return (0); } /* * Inode de-allocation dependencies. * * When an inode's link count is reduced to zero, it can be de-allocated. We * found it convenient to postpone de-allocation until after the inode is * written to disk with its new link count (zero). At this point, all of the * on-disk inode's block pointers are nullified and, with careful dependency * list ordering, all dependencies related to the inode will be satisfied and * the corresponding dependency structures de-allocated. So, if/when the * inode is reused, there will be no mixing of old dependencies with new * ones. This artificial dependency is set up by the block de-allocation * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. */ static void handle_workitem_freefile(freefile) struct freefile *freefile; { struct workhead wkhd; struct fs *fs; struct ufsmount *ump; int error; #ifdef INVARIANTS struct inodedep *idp; #endif ump = VFSTOUFS(freefile->fx_list.wk_mp); fs = ump->um_fs; #ifdef INVARIANTS ACQUIRE_LOCK(ump); error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(ump); if (error) panic("handle_workitem_freefile: inodedep %p survived", idp); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); LIST_INIT(&wkhd); LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefile, D_FREEFILE); FREE_LOCK(ump); } /* * Helper function which unlinks marker element from work list and returns * the next element on the list. 
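 *
 * markernext() below supports a common softdep idiom: park a marker
 * element after the current item so a traversal can survive the list
 * changing while a lock is dropped. A self-contained sketch of the idiom
 * (demo_* names are hypothetical):
 */
#include <sys/queue.h>
#include <stdio.h>

struct demo_wk {
	LIST_ENTRY(demo_wk)	wk_list;
	int			wk_type;	/* 0 is the marker */
};
LIST_HEAD(demo_wkhd, demo_wk);

/* Unlink the marker and return whatever now follows it. */
static struct demo_wk *
demo_markernext(struct demo_wk *marker)
{
	struct demo_wk *next;

	next = LIST_NEXT(marker, wk_list);
	LIST_REMOVE(marker, wk_list);
	return (next);
}

int
main(void)
{
	struct demo_wkhd hd = LIST_HEAD_INITIALIZER(hd);
	struct demo_wk marker = { .wk_type = 0 };
	struct demo_wk a = { .wk_type = 1 }, b = { .wk_type = 2 }, *wk;

	LIST_INSERT_HEAD(&hd, &b, wk_list);
	LIST_INSERT_HEAD(&hd, &a, wk_list);

	for (wk = LIST_FIRST(&hd); wk != NULL; wk = demo_markernext(&marker)) {
		LIST_INSERT_AFTER(wk, &marker, wk_list);
		/* ... the list may be perturbed here ... */
		printf("visit type %d\n", wk->wk_type);
	}
	return (0);
}
/*
 * (end of illustrative sketch)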
*/ static __inline struct worklist * markernext(struct worklist *marker) { struct worklist *next; next = LIST_NEXT(marker, wk_list); LIST_REMOVE(marker, wk_list); return next; } /* * Disk writes. * * The dependency structures constructed above are most actively used when file * system blocks are written to disk. No constraints are placed on when a * block can be written, but unsatisfied update dependencies are made safe by * modifying (or replacing) the source memory for the duration of the disk * write. When the disk write completes, the memory block is again brought * up-to-date. * * In-core inode structure reclamation. * * Because there are a finite number of "in-core" inode structures, they are * reused regularly. By transferring all inode-related dependencies to the * in-memory inode block and indexing them separately (via "inodedep"s), we * can allow "in-core" inode structures to be reused at any time and avoid * any increase in contention. * * Called just before entering the device driver to initiate a new disk I/O. * The buffer must be locked, thus, no I/O completion operations can occur * while we are manipulating its associated dependencies. */ static void softdep_disk_io_initiation(bp) struct buf *bp; /* structure describing disk write to occur */ { struct worklist *wk; struct worklist marker; struct inodedep *inodedep; struct freeblks *freeblks; struct jblkdep *jblkdep; struct newblk *newblk; struct ufsmount *ump; /* * We only care about write operations. There should never * be dependencies for reads. */ if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); if (bp->b_vflags & BV_BKGRDINPROG) panic("softdep_disk_io_initiation: Writing buffer with " "background write in progress: %p", bp); ump = softdep_bp_to_mp(bp); if (ump == NULL) return; marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ ACQUIRE_LOCK(ump); /* * Do any necessary pre-I/O processing. */ for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; wk = markernext(&marker)) { LIST_INSERT_AFTER(wk, &marker, wk_list); switch (wk->wk_type) { case D_PAGEDEP: initiate_write_filepage(WK_PAGEDEP(wk), bp); continue; case D_INODEDEP: inodedep = WK_INODEDEP(wk); if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) initiate_write_inodeblock_ufs1(inodedep, bp); else initiate_write_inodeblock_ufs2(inodedep, bp); continue; case D_INDIRDEP: initiate_write_indirdep(WK_INDIRDEP(wk), bp); continue; case D_BMSAFEMAP: initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); continue; case D_JSEG: WK_JSEG(wk)->js_buf = NULL; continue; case D_FREEBLKS: freeblks = WK_FREEBLKS(wk); jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); /* * We have to wait for the freeblks to be journaled * before we can write an inodeblock with updated * pointers. Be careful to arrange the marker so * we revisit the freeblks if it's not removed by * the first jwait(). */ if (jblkdep != NULL) { LIST_REMOVE(&marker, wk_list); LIST_INSERT_BEFORE(wk, &marker, wk_list); jwait(&jblkdep->jb_list, MNT_WAIT); } continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: /* * We have to wait for the jnewblk to be journaled * before we can write to a block if the contents * may be confused with an earlier file's indirect * at recovery time. Handle the marker as described * above. 
*/ newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk != NULL && indirblk_lookup(newblk->nb_list.wk_mp, newblk->nb_newblkno)) { LIST_REMOVE(&marker, wk_list); LIST_INSERT_BEFORE(wk, &marker, wk_list); jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); } continue; case D_SBDEP: initiate_write_sbdep(WK_SBDEP(wk)); continue; case D_MKDIR: case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: continue; default: panic("handle_disk_io_initiation: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(ump); PRELE(curproc); /* Allow swapout of kernel stack */ } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in a directory. The buffer must be locked, * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. */ static void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { struct jremref *jremref; struct jmvref *jmvref; struct dirrem *dirrem; struct diradd *dap; struct direct *ep; int i; if (pagedep->pd_state & IOSTARTED) { /* * This can only happen if there is a driver that does not * understand chaining. Here biodone will reissue the call * to strategy for the incomplete buffers. */ printf("initiate_write_filepage: already started\n"); return; } pagedep->pd_state |= IOSTARTED; /* * Wait for all journal remove dependencies to hit the disk. * We can not allow any potentially conflicting directory adds * to be visible before removes and rollback is too difficult. * The per-filesystem lock may be dropped and re-acquired, however * we hold the buf locked so the dependency can not go away. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) jwait(&jremref->jr_list, MNT_WAIT); while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) jwait(&jmvref->jm_list, MNT_WAIT); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) panic("%s: dir inum %ju != new %ju", "initiate_write_filepage", (uintmax_t)ep->d_ino, (uintmax_t)dap->da_newinum); if (dap->da_state & DIRCHG) ep->d_ino = dap->da_previous->dm_oldinum; else ep->d_ino = 0; dap->da_state &= ~ATTACHED; dap->da_state |= UNDONE; } } } /* * Version of initiate_write_inodeblock that handles UFS1 dinodes. * Note that any bug fixes made to this routine must be done in the * version found below. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs1(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; struct inoref *inoref; struct ufsmount *ump; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs1: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; ump = VFSTOUFS(inodedep->id_list.wk_mp); LOCK_OWNED(ump); dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If we're on the unlinked list but have not yet written our * next pointer initialize it here. 
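 *
 * A standalone model of the initialization performed below: the on-disk
 * free-list pointer is seeded from the in-memory successor, with zero
 * terminating the chain (demo_* names are hypothetical):
 */
#include <sys/queue.h>
#include <stdio.h>

struct demo_idep {
	TAILQ_ENTRY(demo_idep)	id_unlinked;
	unsigned		id_ino;
};
TAILQ_HEAD(demo_ulst, demo_idep);

int
main(void)
{
	struct demo_ulst lst = TAILQ_HEAD_INITIALIZER(lst);
	struct demo_idep a = { .id_ino = 7 }, b = { .id_ino = 9 };
	struct demo_idep *idep, *inon;
	unsigned di_freelink;

	TAILQ_INSERT_TAIL(&lst, &a, id_unlinked);
	TAILQ_INSERT_TAIL(&lst, &b, id_unlinked);

	TAILQ_FOREACH(idep, &lst, id_unlinked) {
		inon = TAILQ_NEXT(idep, id_unlinked);
		di_freelink = inon != NULL ? inon->id_ino : 0;
		printf("ino %u: di_freelink=%u\n", idep->id_ino, di_freelink);
	}
	return (0);
}
/*
 * (end of illustrative sketch)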
*/ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); dp->di_freelink = inon ? inon->id_ino : 0; } /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino1 != NULL) panic("initiate_write_inodeblock_ufs1: I/O underway"); FREE_LOCK(ump); sip = malloc(sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); inodedep->id_savedino1 = sip; *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; dp->di_freelink = inodedep->id_savedino1->di_freelink; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* * Revert the link count to that of the first unwritten journal entry. */ inoref = TAILQ_FIRST(&inodedep->id_inoreflst); if (inoref) dp->di_nlink = inoref->if_nlink; /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_offset; if (adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs1: " "direct pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset, dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= UFS_NDADDR && dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs1: " "indirect pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset - UFS_NDADDR, dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("initiate_write_inodeblock_ufs1: " "Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_offset >= UFS_NDADDR) break; dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("initiate_write_inodeblock_ufs1: " "lost dep1"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } for (i = 0; i < UFS_NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << UFS_NDADDR) << i)) == 0) panic("initiate_write_inodeblock_ufs1: " "lost dep2"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. 
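 *
 * The rollback below in isolation (demo_* values are hypothetical): scan
 * back from the rolled-back block for the last pointer still claimed and
 * clamp the size to exactly those blocks.
 */
#include <stdint.h>
#include <stdio.h>

#define	DEMO_NDADDR	12
#define	DEMO_BSIZE	4096

int
main(void)
{
	uint32_t di_db[DEMO_NDADDR] = { 0 };
	uint64_t di_size;
	int i, lastoff = 5;	/* block 5 was just rolled back to zero */

	di_db[0] = di_db[1] = di_db[2] = 1111;	/* still-claimed blocks */
	for (i = lastoff; i >= 0; i--)
		if (di_db[i] != 0)
			break;
	di_size = (uint64_t)(i + 1) * DEMO_BSIZE;
	printf("rolled-back size: %ju\n", (uintmax_t)di_size);
	return (0);
}
/*
 * (end of illustrative sketch)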
*/ if (lastadp != NULL && dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0; } /* * Version of initiate_write_inodeblock that handles UFS2 dinodes. * Note that any bug fixes made to this routine must be done in the * version found above. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs2(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; struct inoref *inoref; struct ufsmount *ump; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs2: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; ump = VFSTOUFS(inodedep->id_list.wk_mp); LOCK_OWNED(ump); dp = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If we're on the unlinked list but have not yet written our * next pointer initialize it here. */ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); dp->di_freelink = inon ? inon->id_ino : 0; ffs_update_dinode_ckhash(fs, dp); } /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino2 != NULL) panic("initiate_write_inodeblock_ufs2: I/O underway"); FREE_LOCK(ump); sip = malloc(sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); inodedep->id_savedino2 = sip; *inodedep->id_savedino2 = *dp; bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); dp->di_gen = inodedep->id_savedino2->di_gen; dp->di_freelink = inodedep->id_savedino2->di_freelink; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_extupdt) && TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* * Revert the link count to that of the first unwritten journal entry. */ inoref = TAILQ_FIRST(&inodedep->id_inoreflst); if (inoref) dp->di_nlink = inoref->if_nlink; /* * Set the ext data dependencies to busy. 
*/ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("initiate_write_inodeblock_ufs2: lbn order"); prevlbn = adp->ad_offset; if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs2: " "ext pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("initiate_write_inodeblock_ufs2: Unknown " "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the ext * data which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("initiate_write_inodeblock_ufs2: " "lost dep1"); #endif /* INVARIANTS */ dp->di_extb[i] = 0; } lastadp = NULL; break; } /* * If we have zero'ed out the last allocated block of the ext * data, roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; } /* * Set the file data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); if ((adp->ad_state & ATTACHED) == 0) panic("inodedep %p and adp %p not attached", inodedep, adp); prevlbn = adp->ad_offset; if (!ffs_fsfail_cleanup(ump, 0) && adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs2: " "direct pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (!ffs_fsfail_cleanup(ump, 0) && adp->ad_offset >= UFS_NDADDR && dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs2: " "indirect pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset - UFS_NDADDR, (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("initiate_write_inodeblock_ufs2: Unknown " "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. 
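 *
 * The size clamp used in the loop below, as one worked example: if the
 * entry at block offset 4 rolls back to an 8192-byte fragment with a
 * 32768-byte block size, the on-disk size may claim the four full blocks
 * before it plus that fragment only.
 */
#include <stdio.h>

int
main(void)
{
	long bsize = 32768, off = 4, oldsize = 8192;

	/* cf. dp->di_size = fs->fs_bsize * adp->ad_offset + ad_oldsize */
	printf("rollback size = %ld\n", bsize * off + oldsize);	/* 139264 */
	return (0);
}
/*
 * (end of illustrative sketch)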
*/ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_offset >= UFS_NDADDR) break; dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("initiate_write_inodeblock_ufs2: " "lost dep2"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } for (i = 0; i < UFS_NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << UFS_NDADDR) << i)) == 0) panic("initiate_write_inodeblock_ufs2: " "lost dep3"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } ffs_update_dinode_ckhash(fs, dp); return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is full-sized, as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0; ffs_update_dinode_ckhash(fs, dp); } /* * Cancel an indirdep as a result of truncation. Release all of the * children allocindirs and place their journal work on the appropriate * list. */ static void cancel_indirdep(indirdep, bp, freeblks) struct indirdep *indirdep; struct buf *bp; struct freeblks *freeblks; { struct allocindir *aip; /* * None of the indirect pointers will ever be visible, * so they can simply be tossed. GOINGAWAY ensures * that allocated pointers will be saved in the buffer * cache until they are freed. Note that they will * only be able to be found by their physical address * since the inode mapping the logical address will * be gone. The save buffer used for the safe copy * was allocated in setup_allocindir_phase2 using * the physical address so it could be used for this * purpose. Hence we swap the safe copy with the real * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. */ if (indirdep->ir_state & GOINGAWAY) panic("cancel_indirdep: already gone"); if ((indirdep->ir_state & DEPCOMPLETE) == 0) { indirdep->ir_state |= DEPCOMPLETE; LIST_REMOVE(indirdep, ir_next); } indirdep->ir_state |= GOINGAWAY; /* * Pass in bp for blocks that still have journal writes * pending so we can cancel them on their own.
*/ while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL) cancel_allocindir(aip, bp, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); /* * If there are pending partial truncations we need to keep the * old block copy around until they complete. This is because * the current b_data is not a perfect superset of the available * blocks. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); else bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); WORKLIST_REMOVE(&indirdep->ir_list); WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); indirdep->ir_bp = NULL; indirdep->ir_freeblks = freeblks; } /* * Free an indirdep once it no longer has new pointers to track. */ static void free_indirdep(indirdep) struct indirdep *indirdep; { KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), ("free_indirdep: Indir trunc list not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_completehd), ("free_indirdep: Complete head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_writehd), ("free_indirdep: write head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_donehd), ("free_indirdep: done head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), ("free_indirdep: deplist head not empty.")); KASSERT((indirdep->ir_state & DEPCOMPLETE), ("free_indirdep: %p still on newblk list.", indirdep)); KASSERT(indirdep->ir_saveddata == NULL, ("free_indirdep: %p still has saved data.", indirdep)); KASSERT(indirdep->ir_savebp == NULL, ("free_indirdep: %p still has savebp buffer.", indirdep)); if (indirdep->ir_state & ONWORKLIST) WORKLIST_REMOVE(&indirdep->ir_list); WORKITEM_FREE(indirdep, D_INDIRDEP); } /* * Called before a write to an indirdep. This routine is responsible for * rolling back pointers to a safe state which includes only those * allocindirs which have been completed. */ static void initiate_write_indirdep(indirdep, bp) struct indirdep *indirdep; struct buf *bp; { struct ufsmount *ump; indirdep->ir_state |= IOSTARTED; if (indirdep->ir_state & GOINGAWAY) panic("disk_io_initiation: indirdep gone"); /* * If there are no remaining dependencies, this will be writing * the real pointers. */ if (LIST_EMPTY(&indirdep->ir_deplisthd) && TAILQ_EMPTY(&indirdep->ir_trunc)) return; /* * Replace up-to-date version with safe version. */ if (indirdep->ir_saveddata == NULL) { ump = VFSTOUFS(indirdep->ir_list.wk_mp); LOCK_OWNED(ump); FREE_LOCK(ump); indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); } indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount); } /* * Called when an inode has been cleared in a cg bitmap. 
This finally * eliminates any canceled jaddrefs */ void softdep_setup_inofree(mp, bp, ino, wkhd) struct mount *mp; struct buf *bp; ino_t ino; struct workhead *wkhd; { struct worklist *wk, *wkn; struct inodedep *inodedep; struct ufsmount *ump; uint8_t *inosused; struct cg *cgp; struct fs *fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inofree called on non-softdep filesystem")); ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); if (!ffs_fsfail_cleanup(ump, 0)) { fs = ump->um_fs; cgp = (struct cg *)bp->b_data; inosused = cg_inosused(cgp); if (isset(inosused, ino % fs->fs_ipg)) panic("softdep_setup_inofree: inode %ju not freed.", (uintmax_t)ino); } if (inodedep_lookup(mp, ino, 0, &inodedep)) panic("softdep_setup_inofree: ino %ju has existing inodedep %p", (uintmax_t)ino, inodedep); if (wkhd) { LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { if (wk->wk_type != D_JADDREF) continue; WORKLIST_REMOVE(wk); /* * We can free immediately even if the jaddref * isn't attached in a background write as now * the bitmaps are reconciled. */ wk->wk_state |= COMPLETE | ATTACHED; free_jaddref(WK_JADDREF(wk)); } jwork_move(&bp->b_dep, wkhd); } FREE_LOCK(ump); } /* * Called via ffs_blkfree() after a set of frags has been cleared from a cg * map. Any dependencies waiting for the write to clear are added to the * buf's list and any jnewblks that are being canceled are discarded * immediately. */ void softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) struct mount *mp; struct buf *bp; ufs2_daddr_t blkno; int frags; struct workhead *wkhd; { struct bmsafemap *bmsafemap; struct jnewblk *jnewblk; struct ufsmount *ump; struct worklist *wk; struct fs *fs; #ifdef INVARIANTS uint8_t *blksfree; struct cg *cgp; ufs2_daddr_t jstart; ufs2_daddr_t jend; ufs2_daddr_t end; long bno; int i; #endif CTR3(KTR_SUJ, "softdep_setup_blkfree: blkno %jd frags %d wk head %p", blkno, frags, wkhd); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_blkfree called on non-softdep filesystem")); ACQUIRE_LOCK(ump); /* Lookup the bmsafemap so we track when it is dirty. */ fs = ump->um_fs; bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); /* * Detach any jnewblks which have been canceled. They must linger * until the bitmap is cleared again by ffs_blkfree() to prevent * an unjournaled allocation from hitting the disk. */ if (wkhd) { while ((wk = LIST_FIRST(wkhd)) != NULL) { CTR2(KTR_SUJ, "softdep_setup_blkfree: blkno %jd wk type %d", blkno, wk->wk_type); WORKLIST_REMOVE(wk); if (wk->wk_type != D_JNEWBLK) { WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); continue; } jnewblk = WK_JNEWBLK(wk); KASSERT(jnewblk->jn_state & GOINGAWAY, ("softdep_setup_blkfree: jnewblk not canceled.")); #ifdef INVARIANTS /* * Assert that this block is free in the bitmap * before we discard the jnewblk. */ cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); bno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isset(blksfree, bno + i)) continue; panic("softdep_setup_blkfree: not free"); } #endif /* * Even if it's not attached we can free immediately * as the new bitmap is correct. */ wk->wk_state |= COMPLETE | ATTACHED; free_jnewblk(jnewblk); } } #ifdef INVARIANTS /* * Assert that we are not freeing a block which has an outstanding * allocation dependency. 
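 *
 * (Aside: the check below spells out the two partial-overlap cases
 * explicitly. For comparison, a compact general predicate for two
 * half-open ranges is sketched here with hypothetical names; note it
 * also reports the case where one range fully contains the other.)
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stdbool.h>
#include <stdint.h>

/* True when [s1, e1) and [s2, e2) share at least one address. */
static bool
ranges_overlap(int64_t s1, int64_t e1, int64_t s2, int64_t e2)
{
	return (s1 < e2 && s2 < e1);
}
#endif
/*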
*/ fs = VFSTOUFS(mp)->um_fs; bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); end = blkno + frags; LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { /* * Don't match against blocks that will be freed when the * background write is done. */ if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == (COMPLETE | DEPCOMPLETE)) continue; jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; jend = jnewblk->jn_blkno + jnewblk->jn_frags; if ((blkno >= jstart && blkno < jend) || (end > jstart && end <= jend)) { printf("state 0x%X %jd - %d %d dep %p\n", jnewblk->jn_state, jnewblk->jn_blkno, jnewblk->jn_oldfrags, jnewblk->jn_frags, jnewblk->jn_dep); panic("softdep_setup_blkfree: " "%jd-%jd(%d) overlaps with %jd-%jd", blkno, end, frags, jstart, jend); } } #endif FREE_LOCK(ump); } /* * Revert a block allocation when the journal record that describes it * is not yet written. */ static int jnewblk_rollback(jnewblk, fs, cgp, blksfree) struct jnewblk *jnewblk; struct fs *fs; struct cg *cgp; uint8_t *blksfree; { ufs1_daddr_t fragno; long cgbno, bbase; int frags, blk; int i; frags = 0; cgbno = dtogd(fs, jnewblk->jn_blkno); /* * We have to test which frags need to be rolled back. We may * be operating on a stale copy when doing background writes. */ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) if (isclr(blksfree, cgbno + i)) frags++; if (frags == 0) return (0); /* * This is mostly ffs_blkfree() sans some validation and * superblock updates. */ if (frags == fs->fs_frag) { fragno = fragstoblks(fs, cgbno); ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; } else { cgbno += jnewblk->jn_oldfrags; bbase = cgbno - fragnum(fs, cgbno); /* Decrement the old frags. */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* Deallocate the fragment */ for (i = 0; i < frags; i++) setbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree += frags; /* Add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* If a complete block has been reassembled, account for it. */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; } } stat_jnewblk++; jnewblk->jn_state &= ~ATTACHED; jnewblk->jn_state |= UNDONE; return (frags); } static void initiate_write_bmsafemap(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; /* The cg block. */ { struct jaddref *jaddref; struct jnewblk *jnewblk; uint8_t *inosused; uint8_t *blksfree; struct cg *cgp; struct fs *fs; ino_t ino; /* * If this is a background write, we did this at the time that * the copy was made, so do not need to do it again. */ if (bmsafemap->sm_state & IOSTARTED) return; bmsafemap->sm_state |= IOSTARTED; /* * Clear any inode allocations which are pending journal writes. 
*/ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; inosused = cg_inosused(cgp); LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { ino = jaddref->ja_ino % fs->fs_ipg; if (isset(inosused, ino)) { if ((jaddref->ja_mode & IFMT) == IFDIR) cgp->cg_cs.cs_ndir--; cgp->cg_cs.cs_nifree++; clrbit(inosused, ino); jaddref->ja_state &= ~ATTACHED; jaddref->ja_state |= UNDONE; stat_jaddref++; } else panic("initiate_write_bmsafemap: inode %ju " "marked free", (uintmax_t)jaddref->ja_ino); } } /* * Clear any block allocations which are pending journal writes. */ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; blksfree = cg_blksfree(cgp); LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) continue; panic("initiate_write_bmsafemap: block %jd " "marked free", jnewblk->jn_blkno); } } /* * Move allocation lists to the written lists so they can be * cleared once the block write is complete. */ LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, inodedep, id_deps); LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, newblk, nb_deps); LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); } void softdep_handle_error(struct buf *bp) { struct ufsmount *ump; ump = softdep_bp_to_mp(bp); if (ump == NULL) return; if (ffs_fsfail_cleanup(ump, bp->b_error)) { /* * No future writes will succeed, so the on-disk image is safe. * Pretend that this write succeeded so that the softdep state * will be cleaned up naturally. */ bp->b_ioflags &= ~BIO_ERROR; bp->b_error = 0; } } /* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. * */ static void softdep_disk_write_complete(bp) struct buf *bp; /* describes the completed disk write */ { struct worklist *wk; struct worklist *owk; struct ufsmount *ump; struct workhead reattach; struct freeblks *freeblks; struct buf *sbp; ump = softdep_bp_to_mp(bp); KASSERT(LIST_EMPTY(&bp->b_dep) || ump != NULL, ("softdep_disk_write_complete: softdep_bp_to_mp returned NULL " "with outstanding dependencies for buffer %p", bp)); if (ump == NULL) return; if ((bp->b_ioflags & BIO_ERROR) != 0) softdep_handle_error(bp); /* * If an error occurred while doing the write, then the data * has not hit the disk and the dependencies cannot be processed. * But we do have to go through and roll forward any dependencies * that were rolled back before the disk write. */ sbp = NULL; ACQUIRE_LOCK(ump); if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_PAGEDEP: handle_written_filepage(WK_PAGEDEP(wk), bp, 0); continue; case D_INODEDEP: handle_written_inodeblock(WK_INODEDEP(wk), bp, 0); continue; case D_BMSAFEMAP: handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, 0); continue; case D_INDIRDEP: handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, 0); continue; default: /* nothing to roll forward */ continue; } } FREE_LOCK(ump); if (sbp) brelse(sbp); return; } LIST_INIT(&reattach); /* * Ump SU lock must not be released anywhere in this code segment. 
owk = NULL; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); atomic_add_long(&dep_write[wk->wk_type], 1); if (wk == owk) panic("duplicate worklist: %p\n", wk); owk = wk; switch (wk->wk_type) { case D_PAGEDEP: if (handle_written_filepage(WK_PAGEDEP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: if (handle_written_inodeblock(WK_INODEDEP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); continue; case D_ALLOCDIRECT: wk->wk_state |= COMPLETE; handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); continue; case D_ALLOCINDIR: wk->wk_state |= COMPLETE; handle_allocindir_partdone(WK_ALLOCINDIR(wk)); continue; case D_INDIRDEP: if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_FREEBLKS: wk->wk_state |= COMPLETE; freeblks = WK_FREEBLKS(wk); if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(wk, WK_NODELAY); continue; case D_FREEWORK: handle_written_freework(WK_FREEWORK(wk)); break; case D_JSEGDEP: free_jsegdep(WK_JSEGDEP(wk)); continue; case D_JSEG: handle_written_jseg(WK_JSEG(wk), bp); continue; case D_SBDEP: if (handle_written_sbdep(WK_SBDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_FREEDEP: free_freedep(WK_FREEDEP(wk)); continue; default: panic("softdep_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* * Reattach any requests that must be redone. */ while ((wk = LIST_FIRST(&reattach)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(ump); if (sbp) brelse(sbp); } /* * Called from within softdep_disk_write_complete above. */ static void handle_allocdirect_partdone(adp, wkhd) struct allocdirect *adp; /* the completed allocdirect */ struct workhead *wkhd; /* Work to do when inode is written. */ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; long bsize; LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp)); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. * We check the currently active set of allocdirects in id_inoupdt * or id_extupdt as appropriate. */ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; if (adp->ad_state & EXTDATA) listhead = &inodedep->id_extupdt; else listhead = &inodedep->id_inoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) { /* found our block */ if (listadp == adp) break; /* continue if ad_oldsize is not a fragment */ if (listadp->ad_oldsize == 0 || listadp->ad_oldsize == bsize) continue; /* hit a fragment */ return; } /* * If we have reached the end of the current list without * finding the just finished dependency, then it must be * on the future dependency list. Future dependencies cannot * be freed until they are moved to the current list.
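 *
 * (Aside: the rule above -- nothing past the first fragment rollback
 * may be freed -- reduces to a prefix scan. A hypothetical sketch:)
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stddef.h>
#include <stdint.h>

/* Count the leading dependencies that are safe to free: stop at the
 * first entry whose old size is a fragment (neither 0 nor a full
 * block), since everything from it onward must stay rolled back. */
static size_t
freeable_prefix(const uint32_t *oldsize, size_t n, uint32_t bsize)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (oldsize[i] != 0 && oldsize[i] != bsize)
			break;
	return (i);
}
#endif
/*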
*/ if (listadp == NULL) { #ifdef INVARIANTS if (adp->ad_state & EXTDATA) listhead = &inodedep->id_newextupdt; else listhead = &inodedep->id_newinoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) /* found our block */ if (listadp == adp) break; if (listadp == NULL) panic("handle_allocdirect_partdone: lost dep"); #endif /* INVARIANTS */ return; } /* * If we have found the just finished dependency, then queue * it along with anything that follows it that is complete. * Since the pointer has not yet been written in the inode * as the dependency prevents it, place the allocdirect on the * bufwait list where it will be freed once the pointer is * valid. */ if (wkhd == NULL) wkhd = &inodedep->id_bufwait; for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; TAILQ_REMOVE(listhead, adp, ad_next); WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); } } /* * Called from within softdep_disk_write_complete above. This routine * completes successfully written allocindirs. */ static void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { struct indirdep *indirdep; if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; indirdep = aip->ai_indirdep; LIST_REMOVE(aip, ai_next); /* * Don't set a pointer while the buffer is undergoing IO or while * we have active truncations. */ if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; /* * Await the pointer write before freeing the allocindir. */ LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); } /* * Release segments held on a jwork list. */ static void handle_jwork(wkhd) struct workhead *wkhd; { struct worklist *wk; while ((wk = LIST_FIRST(wkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_JSEGDEP: free_jsegdep(WK_JSEGDEP(wk)); continue; case D_FREEDEP: free_freedep(WK_FREEDEP(wk)); continue; case D_FREEFRAG: rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); WORKITEM_FREE(wk, D_FREEFRAG); continue; case D_FREEWORK: handle_written_freework(WK_FREEWORK(wk)); continue; default: panic("handle_jwork: Unknown type %s\n", TYPENAME(wk->wk_type)); } } } /* * Handle the bufwait list on an inode when it is safe to release items * held there. This normally happens after an inode block is written but * may be delayed and handled later if there are pending journal items that * are not yet safe to be released. */ static struct freefile * handle_bufwait(inodedep, refhd) struct inodedep *inodedep; struct workhead *refhd; { struct jaddref *jaddref; struct freefile *freefile; struct worklist *wk; freefile = NULL; while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_FREEFILE: /* * We defer adding freefile to the worklist * until all other additions have been made to * ensure that it will be done after all the * old blocks have been freed. 
*/ if (freefile != NULL) panic("handle_bufwait: freefile"); freefile = WK_FREEFILE(wk); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); continue; case D_DIRADD: diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEFRAG: wk->wk_state |= COMPLETE; if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(wk, 0); continue; case D_DIRREM: wk->wk_state |= COMPLETE; add_to_worklist(wk, 0); continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: free_newblk(WK_NEWBLK(wk)); continue; case D_JNEWBLK: wk->wk_state |= COMPLETE; free_jnewblk(WK_JNEWBLK(wk)); continue; /* * Save freed journal segments and add references on * the supplied list which will delay their release * until the cg bitmap is cleared on disk. */ case D_JSEGDEP: if (refhd == NULL) free_jsegdep(WK_JSEGDEP(wk)); else WORKLIST_INSERT(refhd, wk); continue; case D_JADDREF: jaddref = WK_JADDREF(wk); TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); /* * Transfer any jaddrefs to the list to be freed with * the bitmap if we're handling a removed file. */ if (refhd == NULL) { wk->wk_state |= COMPLETE; free_jaddref(jaddref); } else WORKLIST_INSERT(refhd, wk); continue; default: panic("handle_bufwait: Unknown type %p(%s)", wk, TYPENAME(wk->wk_type)); /* NOTREACHED */ } } return (freefile); } /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further * interrupts from this device blocked. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_inodeblock(inodedep, bp, flags) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ int flags; { struct freefile *freefile; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; struct workhead wkhd; int hadchanges, fstype; ino_t freelink; LIST_INIT(&wkhd); hadchanges = 0; freefile = NULL; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); freelink = dp1->di_freelink; } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); freelink = dp2->di_freelink; } /* * Leave this inodeblock dirty until it's in the list. */ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED && (flags & WRITESUCCEEDED)) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); if ((inon == NULL && freelink == 0) || (inon && inon->id_ino == freelink)) { if (inon) inon->id_state |= UNLINKPREV; inodedep->id_state |= UNLINKNEXT; } hadchanges = 1; } /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until * all associated dependencies have been cleared and the * corresponding updates written to disk. 
*/ if (inodedep->id_savedino1 != NULL) { hadchanges = 1; if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else *dp2 = *inodedep->id_savedino2; free(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); /* * If the inode is clear here and GOINGAWAY it will never * be written. Process the bufwait and clear any pending * work which may include the freefile. */ if (inodedep->id_state & GOINGAWAY) goto bufwait; return (1); } if (flags & WRITESUCCEEDED) inodedep->id_state |= COMPLETE; /* * Roll forward anything that had to be rolled back before * the inode could be updated. */ for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { if (adp->ad_offset < UFS_NDADDR) { if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", (intmax_t)adp->ad_offset, dp1->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp1->di_db[adp->ad_offset] = adp->ad_newblkno; } else { if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_offset - UFS_NDADDR, dp1->di_ib[adp->ad_offset - UFS_NDADDR]); dp1->di_ib[adp->ad_offset - UFS_NDADDR] = adp->ad_newblkno; } } else { if (adp->ad_offset < UFS_NDADDR) { if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", (intmax_t)adp->ad_offset, "mismatch", (intmax_t)dp2->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp2->di_db[adp->ad_offset] = adp->ad_newblkno; } else { if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_offset - UFS_NDADDR, (intmax_t) dp2->di_ib[adp->ad_offset - UFS_NDADDR]); dp2->di_ib[adp->ad_offset - UFS_NDADDR] = adp->ad_newblkno; } } adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", (intmax_t)adp->ad_offset, "mismatch", (intmax_t)dp2->di_extb[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. 
*/ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); if (inodedep->id_savednlink > UFS_LINK_MAX) panic("handle_written_inodeblock: Invalid link count " "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink, inodedep); if (fstype == UFS1) { if (dp1->di_nlink != inodedep->id_savednlink) { dp1->di_nlink = inodedep->id_savednlink; hadchanges = 1; } if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { if (dp2->di_nlink != inodedep->id_savednlink) { dp2->di_nlink = inodedep->id_savednlink; hadchanges = 1; } if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; } if (dp2->di_extsize != inodedep->id_savedextsize) { dp2->di_extsize = inodedep->id_savedextsize; hadchanges = 1; } } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_savednlink = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that it will eventually get written back in * its correct form. */ if (hadchanges) { if (fstype == UFS2) ffs_update_dinode_ckhash(inodedep->id_fs, dp2); bdirty(bp); } bufwait: /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) return (hadchanges); /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp, &wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) handle_allocdirect_partdone(adp, &wkhd); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new <vfsid, inum, lbn> triples * before the old ones have been deleted. Completely * unlinked inodes are not processed until the unlinked * inode list is written or the last reference is removed. */ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { freefile = handle_bufwait(inodedep, NULL); if (freefile && !LIST_EMPTY(&wkhd)) { WORKLIST_INSERT(&wkhd, &freefile->fx_list); freefile = NULL; } } /* * Move rolled forward dependency completions to the bufwait list * now that those that were already written have been processed. */ if (!LIST_EMPTY(&wkhd) && hadchanges == 0) panic("handle_written_inodeblock: bufwait but no changes"); jwork_move(&inodedep->id_bufwait, &wkhd); if (freefile != NULL) { /* * If the inode is goingaway it was never written. Fake up * the state here so free_inodedep() can succeed. */ if (inodedep->id_state & GOINGAWAY) inodedep->id_state |= COMPLETE | DEPCOMPLETE; if (free_inodedep(inodedep) == 0) panic("handle_written_inodeblock: live inodedep %p", inodedep); add_to_worklist(&freefile->fx_list, 0); return (0); } /* * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && TAILQ_FIRST(&inodedep->id_extupdt) == 0 && LIST_FIRST(&inodedep->id_bufwait) == 0)) return (0); return (hadchanges); } /* * Perform needed roll-forwards and kick off any dependencies that * can now be processed. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed.
*/ static int handle_written_indirdep(indirdep, bp, bpp, flags) struct indirdep *indirdep; struct buf *bp; struct buf **bpp; int flags; { struct allocindir *aip; struct buf *sbp; int chgs; if (indirdep->ir_state & GOINGAWAY) panic("handle_written_indirdep: indirdep gone"); if ((indirdep->ir_state & IOSTARTED) == 0) panic("handle_written_indirdep: IO not started"); chgs = 0; /* * If there were rollbacks revert them here. */ if (indirdep->ir_saveddata) { bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); if (TAILQ_EMPTY(&indirdep->ir_trunc)) { free(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = NULL; } chgs = 1; } indirdep->ir_state &= ~(UNDONE | IOSTARTED); indirdep->ir_state |= ATTACHED; /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) { stat_indir_blk_ptrs++; bdirty(bp); return (1); } /* * Move allocindirs with written pointers to the completehd if * the indirdep's pointer is not yet written. Otherwise * free them here. */ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) { LIST_REMOVE(aip, ai_next); if ((indirdep->ir_state & DEPCOMPLETE) == 0) { LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, ai_next); newblk_freefrag(&aip->ai_block); continue; } free_newblk(&aip->ai_block); } /* * Move allocindirs that have finished dependency processing from * the done list to the write list after updating the pointers. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) { while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) { handle_allocindir_partdone(aip); if (aip == LIST_FIRST(&indirdep->ir_donehd)) panic("disk_write_complete: not gone"); chgs = 1; } } /* * Preserve the indirdep if there were any changes or if it is not * yet valid on disk. */ if (chgs) { stat_indir_blk_ptrs++; bdirty(bp); return (1); } /* * If there were no changes we can discard the savedbp and detach * ourselves from the buf. We are only carrying completed pointers * in this case. */ sbp = indirdep->ir_savebp; sbp->b_flags |= B_INVAL | B_NOCACHE; indirdep->ir_savebp = NULL; indirdep->ir_bp = NULL; if (*bpp != NULL) panic("handle_written_indirdep: bp already exists."); *bpp = sbp; /* * The indirdep may not be freed until its parent points at it. */ if (indirdep->ir_state & DEPCOMPLETE) free_indirdep(indirdep); return (0); } /* * Process a diradd entry after its dependent inode has been written. */ static void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp)); dap->da_state |= COMPLETE; complete_diradd(dap); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* * Returns true if the bmsafemap will have rollbacks when written. Must only * be called with the per-filesystem lock and the buf lock on the cg held. */ static int bmsafemap_backgroundwrite(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; { int dirty; LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp)); dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | !LIST_EMPTY(&bmsafemap->sm_jnewblkhd); /* * If we're initiating a background write we need to process the * rollbacks as they exist now, not as they exist when IO starts. * No other consumers will look at the contents of the shadowed * buf so this is safe to do here. */ if (bp->b_xflags & BX_BKGRDMARKER) initiate_write_bmsafemap(bmsafemap, bp); return (dirty); } /* * Re-apply an allocation when a cg write is complete. 
*/ static int jnewblk_rollforward(jnewblk, fs, cgp, blksfree) struct jnewblk *jnewblk; struct fs *fs; struct cg *cgp; uint8_t *blksfree; { ufs1_daddr_t fragno; ufs2_daddr_t blkno; long cgbno, bbase; int frags, blk; int i; frags = 0; cgbno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isclr(blksfree, cgbno + i)) panic("jnewblk_rollforward: re-allocated fragment"); frags++; } if (frags == fs->fs_frag) { blkno = fragstoblks(fs, cgbno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; } else { bbase = cgbno - fragnum(fs, cgbno); cgbno += jnewblk->jn_oldfrags; /* If a complete block had been reassembled, account for it. */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree += fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, -1); cgp->cg_cs.cs_nbfree--; } /* Decrement the old frags. */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* Allocate the fragment */ for (i = 0; i < frags; i++) clrbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree -= frags; /* Add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); } return (frags); } /* * Complete a write to a bmsafemap structure. Roll forward any bitmap * changes if it's not a background write. Set all written dependencies * to DEPCOMPLETE and free the structure if possible. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_bmsafemap(bmsafemap, bp, flags) struct bmsafemap *bmsafemap; struct buf *bp; int flags; { struct newblk *newblk; struct inodedep *inodedep; struct jaddref *jaddref, *jatmp; struct jnewblk *jnewblk, *jntmp; struct ufsmount *ump; uint8_t *inosused; uint8_t *blksfree; struct cg *cgp; struct fs *fs; ino_t ino; int foreground; int chgs; if ((bmsafemap->sm_state & IOSTARTED) == 0) panic("handle_written_bmsafemap: Not started\n"); ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); chgs = 0; bmsafemap->sm_state &= ~IOSTARTED; foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0; /* * If write was successful, release journal work that was waiting * on the write. Otherwise move the work back. */ if (flags & WRITESUCCEEDED) handle_jwork(&bmsafemap->sm_freewr); else LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); /* * Restore unwritten inode allocation pending jaddref writes. */ if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; inosused = cg_inosused(cgp); LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps, jatmp) { if ((jaddref->ja_state & UNDONE) == 0) continue; ino = jaddref->ja_ino % fs->fs_ipg; if (isset(inosused, ino)) panic("handle_written_bmsafemap: " "re-allocated inode"); /* Do the roll-forward only if it's a real copy. */ if (foreground) { if ((jaddref->ja_mode & IFMT) == IFDIR) cgp->cg_cs.cs_ndir++; cgp->cg_cs.cs_nifree--; setbit(inosused, ino); chgs = 1; } jaddref->ja_state &= ~UNDONE; jaddref->ja_state |= ATTACHED; free_jaddref(jaddref); } } /* * Restore any block allocations which are pending journal writes. 
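 *
 * (Aside: jnewblk_rollback() earlier and jnewblk_rollforward() above
 * are inverses over the same frag run in the cg free-block map. A
 * minimal sketch of that pair, with hypothetical helpers; in the real
 * map a set bit means "free".)
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stdint.h>

#define	SETBIT(m, b)	((m)[(b) >> 3] |= (uint8_t)(1U << ((b) & 7)))
#define	CLRBIT(m, b)	((m)[(b) >> 3] &= (uint8_t)~(1U << ((b) & 7)))

/* Rollback: mark the journaled frags free again (set bits). */
static void
frags_rollback(uint8_t *blksfree, long cgbno, int oldfrags, int frags)
{
	int i;

	for (i = oldfrags; i < frags; i++)
		SETBIT(blksfree, cgbno + i);
}

/* Roll-forward: re-apply the allocation (clear bits). */
static void
frags_rollforward(uint8_t *blksfree, long cgbno, int oldfrags, int frags)
{
	int i;

	for (i = oldfrags; i < frags; i++)
		CLRBIT(blksfree, cgbno + i);
}
#endif
/*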
*/ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; blksfree = cg_blksfree(cgp); LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, jntmp) { if ((jnewblk->jn_state & UNDONE) == 0) continue; /* Do the roll-forward only if it's a real copy. */ if (foreground && jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) chgs = 1; jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); jnewblk->jn_state |= ATTACHED; free_jnewblk(jnewblk); } } /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) { LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, newblk, nb_deps); LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); if (foreground) bdirty(bp); return (1); } while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_state &= ~ONDEPLIST; newblk->nb_bmsafemap = NULL; LIST_REMOVE(newblk, nb_deps); if (newblk->nb_list.wk_type == D_ALLOCDIRECT) handle_allocdirect_partdone( WK_ALLOCDIRECT(&newblk->nb_list), NULL); else if (newblk->nb_list.wk_type == D_ALLOCINDIR) handle_allocindir_partdone( WK_ALLOCINDIR(&newblk->nb_list)); else if (newblk->nb_list.wk_type != D_NEWBLK) panic("handle_written_bmsafemap: Unexpected type: %s", TYPENAME(newblk->nb_list.wk_type)); } while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { inodedep->id_state |= DEPCOMPLETE; inodedep->id_state &= ~ONDEPLIST; LIST_REMOVE(inodedep, id_deps); inodedep->id_bmsafemap = NULL; } LIST_REMOVE(bmsafemap, sm_next); if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && LIST_EMPTY(&bmsafemap->sm_newblkhd) && LIST_EMPTY(&bmsafemap->sm_inodedephd) && LIST_EMPTY(&bmsafemap->sm_freehd)) { LIST_REMOVE(bmsafemap, sm_hash); WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); return (0); } LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); if (foreground) bdirty(bp); return (1); } /* * Try to free a mkdir dependency. */ static void complete_mkdir(mkdir) struct mkdir *mkdir; { struct diradd *dap; if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) return; LIST_REMOVE(mkdir, md_mkdirs); dap = mkdir->md_diradd; dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { dap->da_state |= DEPCOMPLETE; complete_diradd(dap); } WORKITEM_FREE(mkdir, D_MKDIR); } /* * Handle the completion of a mkdir dependency. */ static void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; { if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) panic("handle_written_mkdir: bad type"); mkdir->md_state |= COMPLETE; complete_mkdir(mkdir); } static int free_pagedep(pagedep) struct pagedep *pagedep; { int i; if (pagedep->pd_state & NEWBLOCK) return (0); if (!LIST_EMPTY(&pagedep->pd_dirremhd)) return (0); for (i = 0; i < DAHASHSZ; i++) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) return (0); if (!LIST_EMPTY(&pagedep->pd_pendinghd)) return (0); if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) return (0); if (pagedep->pd_state & ONWORKLIST) WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); return (1); } /* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. 
* Note that this routine is always called from interrupt level * with further interrupts from this device blocked. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_filepage(pagedep, bp, flags) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ int flags; { struct dirrem *dirrem; struct diradd *dap, *nextdap; struct direct *ep; int i, chgs; if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; if ((flags & WRITESUCCEEDED) == 0) goto rollforward; /* * Process any directory removals that have been committed. */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_state |= COMPLETE; dirrem->dm_dirinum = pagedep->pd_ino; KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), ("handle_written_filepage: Journal entries not written.")); add_to_worklist(&dirrem->dm_list, 0); } /* * Free any directory additions that have been committed. * If it is a newly allocated block, we have to wait until * the on-disk directory inode claims the new block. */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap, NULL); rollforward: /* * Uncommitted directory entries must be restored. */ for (chgs = 0, i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = nextdap) { nextdap = LIST_NEXT(dap, da_pdlist); if (dap->da_state & ATTACHED) panic("handle_written_filepage: attached"); ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); ep->d_ino = dap->da_newinum; dap->da_state &= ~UNDONE; dap->da_state |= ATTACHED; chgs = 1; /* * If the inode referenced by the directory has * been written out, then the dependency can be * moved to the pending list. */ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } } /* * If there were any rollbacks in the directory, then it must be * marked dirty so that it will eventually get written back in * its correct form. */ if (chgs || (flags & WRITESUCCEEDED) == 0) { if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; bdirty(bp); return (1); } /* * If we are not waiting for a new directory block to be * claimed by its inode, then the pagedep will be freed. * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. */ free_pagedep(pagedep); return (0); } /* * Writing back in-core inode structures. * * The filesystem only accesses an inode's contents when it occupies an * "in-core" inode structure. These "in-core" structures are separate from * the page frames used to cache inode blocks. Only the latter are * transferred to/from the disk. So, when the updated contents of the * "in-core" inode structure are copied to the corresponding in-memory inode * block, the dependencies are also transferred. The following procedure is * called when copying a dirty "in-core" inode to a cached inode block. */ /* * Called when an inode is loaded from disk. If the effective link count * differed from the actual link count when it was last flushed, then we * need to ensure that the correct effective link count is put back.
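 *
 * (Aside: the invariant restored below, in miniature -- the effective
 * link count is the on-disk count minus the deltas not yet committed.
 * A hypothetical userland sketch:)
 */
#if 0	/* illustrative sketch only; not compiled */
#include <assert.h>

static int
effective_nlink(int nlink_ondisk, int nlinkdelta)
{
	int effnlink = nlink_ondisk - nlinkdelta;

	assert(effnlink >= 0);	/* mirrors the KASSERT below */
	return (effnlink);
}
#endif
/*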
*/ void softdep_load_inodeblock(ip) struct inode *ip; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_load_inodeblock called on non-softdep filesystem")); /* * Check for alternate nlink count. */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(ump); if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); return; } if (ip->i_nlink != inodedep->id_nlinkwrote && inodedep->id_nlinkwrote != -1) { KASSERT(ip->i_nlink == 0 && (ump->um_flags & UM_FSFAIL_CLEANUP) != 0, ("read bad i_nlink value")); ip->i_effnlink = ip->i_nlink = inodedep->id_nlinkwrote; } ip->i_effnlink -= inodedep->id_nlinkdelta; KASSERT(ip->i_effnlink >= 0, ("softdep_load_inodeblock: negative i_effnlink")); FREE_LOCK(ump); } /* * This routine is called just before the "in-core" inode * information is to be copied to the in-memory inode block. * Recall that an inode block contains several inodes. If * the force flag is set, then the dependencies will be * cleared so that the update can always be made. Note that * the buffer is locked when this routine is called, so we * will never be in the middle of writing the inode block * to disk. */ void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; /* the "in_core" copy of the inode */ struct buf *bp; /* the buffer containing the inode block */ int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; struct worklist *wk; struct mount *mp; struct buf *ibp; struct fs *fs; int error; ump = ITOUMP(ip); mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_update_inodeblock called on non-softdep filesystem")); fs = ump->um_fs; /* * Preserve the freelink that is on disk. clear_unlinked_inodedep() * does not have access to the in-core ip so must write directly into * the inode block buffer when setting freelink. */ if (fs->fs_magic == FS_UFS1_MAGIC) DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))->di_freelink); else DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))->di_freelink); /* * If the effective link count is not equal to the actual link * count, then we must track the difference in an inodedep while * the inode is (potentially) tossed out of the cache. Otherwise, * if there is no existing inodedep, then there are no dependencies * to track. */ ACQUIRE_LOCK(ump); again: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); if (ip->i_effnlink != ip->i_nlink) panic("softdep_update_inodeblock: bad link count"); return; } KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta, ("softdep_update_inodeblock inconsistent ip %p i_nlink %d " "inodedep %p id_nlinkdelta %jd", ip, ip->i_nlink, inodedep, (intmax_t)inodedep->id_nlinkdelta)); inodedep->id_nlinkwrote = ip->i_nlink; if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* * If we're flushing all dependencies we must also move any waiting * for journal writes onto the bufwait list prior to I/O. */ if (waitfor) { TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto again; } } } /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. 
*/ inodedep->id_state &= ~COMPLETE; if ((inodedep->id_state & ONWORKLIST) == 0) WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); /* * Any new dependencies associated with the incore inode must * now be moved to the list associated with the buffer holding * the in-memory copy of the inode. Once merged process any * allocdirects that are completed by the merger. */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), NULL); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), NULL); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk * can be moved to the id_bufwait so that they will be * processed when the buffer I/O completes. */ while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&inodedep->id_bufwait, wk); } /* * Newly allocated inodes cannot be written until the bitmap * that allocates them has been written (indicated by * DEPCOMPLETE being set in id_state). If we are doing a * forced sync (e.g., an fsync on a file), we force the bitmap * to be written so that the update can be done. */ if (waitfor == 0) { FREE_LOCK(ump); return; } retry: if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { FREE_LOCK(ump); return; } ibp = inodedep->id_bmsafemap->sm_buf; ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT); if (ibp == NULL) { /* * If ibp came back as NULL, the dependency could have been * freed while we slept. Look it up again, and check to see * that it has completed. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) goto retry; FREE_LOCK(ump); return; } FREE_LOCK(ump); if ((error = bwrite(ibp)) != 0) softdep_error("softdep_update_inodeblock: bwrite", error); } /* * Merge a new inode dependency list (such as id_newinoupdt) into an * old inode dependency list (such as id_inoupdt). */ static void merge_inode_lists(newlisthead, oldlisthead) struct allocdirectlst *newlisthead; struct allocdirectlst *oldlisthead; { struct allocdirect *listadp, *newadp; newadp = TAILQ_FIRST(newlisthead); if (newadp != NULL) LOCK_OWNED(VFSTOUFS(newadp->ad_block.nb_list.wk_mp)); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { if (listadp->ad_offset < newadp->ad_offset) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); if (listadp->ad_offset == newadp->ad_offset) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; } newadp = TAILQ_FIRST(newlisthead); } while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); } } /* * If we are doing an fsync, then we must ensure that any directory * entries for the inode have been written after the inode gets to disk.
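 *
 * (Aside: merge_inode_lists() above is a standard merge of two
 * offset-sorted sequences in which an equal-offset pair collapses to
 * the newer entry. An array-based sketch of the same policy, names
 * hypothetical:)
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stddef.h>

static size_t
merge_sorted(const long *oldv, size_t nold, const long *newv, size_t nnew,
    long *out)
{
	size_t i = 0, j = 0, k = 0;

	while (i < nold && j < nnew) {
		if (oldv[i] < newv[j])
			out[k++] = oldv[i++];
		else if (newv[j] < oldv[i])
			out[k++] = newv[j++];
		else {			/* equal keys: newer entry wins */
			out[k++] = newv[j++];
			i++;
		}
	}
	while (i < nold)
		out[k++] = oldv[i++];
	while (j < nnew)
		out[k++] = newv[j++];
	return (k);
}
#endif
/*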
*/ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct pagedep *pagedep; struct inoref *inoref; struct ufsmount *ump; struct worklist *wk; struct diradd *dap; struct mount *mp; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct thread *td = curthread; int error, flushparent, pagedep_new_block; ino_t parentino; ufs_lbn_t lbn; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); fs = ump->um_fs; if (MOUNTEDSOFTDEP(mp) == 0) return (0); ACQUIRE_LOCK(ump); restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); return (0); } TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (!LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) panic("softdep_fsync: pending ops %p", inodedep); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* * Flush our parent if this directory entry has a MKDIR_PARENT * dependency or is contained in a newly allocated block. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; parentino = pagedep->pd_ino; lbn = pagedep->pd_lbn; if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) panic("softdep_fsync: dirty"); if ((dap->da_state & MKDIR_PARENT) || (pagedep->pd_state & NEWBLOCK)) flushparent = 1; else flushparent = 0; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the * vnode below, so we just have to give up on writing its * directory entry out. It will eventually be written, just * not now, but then the user was not asking to have it * written, so we are not breaking any promises. */ if (VN_IS_DOOMED(vp)) break; /* * We prevent deadlock by always fetching inodes from the * root, moving down the directory tree. Thus, when fetching * our parent directory, we first try to get the lock. If * that fails, we must unlock ourselves before requesting * the lock on our parent. See the comment in ufs_lookup * for details on possible races. */ FREE_LOCK(ump); if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ)) { /* * Unmount cannot proceed after unlock because * caller must have called vn_start_write(). */ VOP_UNLOCK(vp); error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ); MPASS(VTOI(pvp)->i_mode != 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(vp)) { if (error == 0) vput(pvp); error = ENOENT; } if (error != 0) return (error); } /* * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps * that are contained in direct blocks will be resolved by * doing a ffs_update. Pagedeps contained in indirect blocks * may require a complete sync'ing of the directory. So, we * try the cheap and fast ffs_update first, and if that fails, * then we do the slower ffs_syncvnode of the directory. 
*/ if (flushparent) { int locked; if ((error = ffs_update(pvp, 1)) != 0) { vput(pvp); return (error); } ACQUIRE_LOCK(ump); locked = 1; if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; pagedep_new_block = pagedep->pd_state & NEWBLOCK; FREE_LOCK(ump); locked = 0; if (pagedep_new_block && (error = ffs_syncvnode(pvp, MNT_WAIT, 0))) { vput(pvp); return (error); } } } if (locked) FREE_LOCK(ump); } /* * Flush directory page containing the inode's name. */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, &bp); if (error == 0) error = bwrite(bp); else brelse(bp); vput(pvp); if (!ffs_fsfail_cleanup(ump, error)) return (error); ACQUIRE_LOCK(ump); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) break; } FREE_LOCK(ump); return (0); } /* * Flush all the dirty bitmaps associated with the block device * before flushing the rest of the dirty blocks so as to reduce * the number of dependencies that will have to be rolled back. * * XXX Unused? */ void softdep_fsync_mountdev(vp) struct vnode *vp; { struct buf *bp, *nbp; struct worklist *wk; struct bufobj *bo; - if (!vn_isdisk(vp, NULL)) + if (!vn_isdisk(vp)) panic("softdep_fsync_mountdev: vnode not a disk"); bo = &vp->v_bufobj; restart: BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { /* * If it is already scheduled, skip to the next buffer. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* * We are only interested in bitmaps with outstanding * dependencies. */ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || wk->wk_type != D_BMSAFEMAP || (bp->b_vflags & BV_BKGRDINPROG)) { BUF_UNLOCK(bp); continue; } BO_UNLOCK(bo); bremfree(bp); (void) bawrite(bp); goto restart; } drain_output(vp); BO_UNLOCK(bo); } /* * Sync all cylinder groups that were dirty at the time this function is * called. Newly dirtied cgs will be inserted before the sentinel. This * is used to flush freedep activity that may be holding up writes to an * indirect block. */ static int sync_cgs(mp, waitfor) struct mount *mp; int waitfor; { struct bmsafemap *bmsafemap; struct bmsafemap *sentinel; struct ufsmount *ump; struct buf *bp; int error; sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK); sentinel->sm_cg = -1; ump = VFSTOUFS(mp); error = 0; ACQUIRE_LOCK(ump); LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next); for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL; bmsafemap = LIST_NEXT(sentinel, sm_next)) { /* Skip sentinels and cgs with no work to release. */ if (bmsafemap->sm_cg == -1 || (LIST_EMPTY(&bmsafemap->sm_freehd) && LIST_EMPTY(&bmsafemap->sm_freewr))) { LIST_REMOVE(sentinel, sm_next); LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); continue; } /* * If we don't get the lock and we're waiting, try again; if * not, move on to the next buf and try to sync it.
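 *
 * (Aside: sync_cgs() above walks its list with a sentinel so the scan
 * survives dropping the lock mid-walk. The same idea on a plain
 * singly-linked list, all names hypothetical:)
 */
#if 0	/* illustrative sketch only; not compiled */
#include <stddef.h>

struct node {
	struct node	*next;
};

/* Visit every node after 'head', keeping 'sentinel' in the list to
 * mark our position; nodes inserted behind the sentinel while the
 * lock is dropped are not revisited. */
static void
walk_with_sentinel(struct node *head, struct node *sentinel,
    void (*work)(struct node *))
{
	struct node *prev, *n;

	sentinel->next = head->next;		/* insert after head */
	head->next = sentinel;
	prev = head;
	while ((n = sentinel->next) != NULL) {
		prev->next = n;			/* move sentinel past n */
		sentinel->next = n->next;
		n->next = sentinel;
		prev = n;
		/* ...drop and reacquire the real lock around work()... */
		work(n);
	}
	prev->next = NULL;			/* unlink the sentinel */
}
#endif
/*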
*/ bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor); if (bp == NULL && waitfor == MNT_WAIT) continue; LIST_REMOVE(sentinel, sm_next); LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); if (bp == NULL) continue; FREE_LOCK(ump); if (waitfor == MNT_NOWAIT) bawrite(bp); else error = bwrite(bp); ACQUIRE_LOCK(ump); if (error) break; } LIST_REMOVE(sentinel, sm_next); FREE_LOCK(ump); free(sentinel, M_BMSAFEMAP); return (error); } /* * This routine is called when we are trying to synchronously flush a * file. This routine must eliminate any filesystem metadata dependencies * so that the syncing routine can succeed. */ int softdep_sync_metadata(struct vnode *vp) { struct inode *ip; int error; ip = VTOI(vp); KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_sync_metadata called on non-softdep filesystem")); /* * Ensure that any direct block dependencies have been cleared, * truncations are started, and inode references are journaled. */ ACQUIRE_LOCK(VFSTOUFS(vp->v_mount)); /* * Write all journal records to prevent rollbacks on devvp. */ if (vp->v_type == VCHR) softdep_flushjournal(vp->v_mount); error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number); /* * Ensure that all truncates are written so we won't find deps on * indirect blocks. */ process_truncates(vp); FREE_LOCK(VFSTOUFS(vp->v_mount)); return (error); } /* * This routine is called when we are attempting to sync a buf with * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any * other IO it can but returns EBUSY if the buffer is not yet able to * be written. Dependencies which will not cause rollbacks will always * return 0. */ int softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) { struct indirdep *indirdep; struct pagedep *pagedep; struct allocindir *aip; struct newblk *newblk; struct ufsmount *ump; struct buf *nbp; struct worklist *wk; int i, error; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_sync_buf called on non-softdep filesystem")); /* * For VCHR we just don't want to force flush any dependencies that * will cause rollbacks. */ if (vp->v_type == VCHR) { if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) return (EBUSY); return (0); } ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); /* * As we hold the buffer locked, none of its dependencies * will disappear. 
*/ error = 0; top: LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_ALLOCDIRECT: case D_ALLOCINDIR: newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk != NULL) { if (waitfor == MNT_NOWAIT) { error = EBUSY; goto out_unlock; } jwait(&newblk->nb_jnewblk->jn_list, waitfor); goto top; } if (newblk->nb_state & DEPCOMPLETE || waitfor == MNT_NOWAIT) continue; nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); if (nbp == NULL) goto top; FREE_LOCK(ump); if ((error = bwrite(nbp)) != 0) goto out; ACQUIRE_LOCK(ump); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (waitfor == MNT_NOWAIT) { if (!TAILQ_EMPTY(&indirdep->ir_trunc) || !LIST_EMPTY(&indirdep->ir_deplisthd)) { error = EBUSY; goto out_unlock; } } if (!TAILQ_EMPTY(&indirdep->ir_trunc)) panic("softdep_sync_buf: truncation pending."); restart: LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { newblk = (struct newblk *)aip; if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, waitfor); goto restart; } if (newblk->nb_state & DEPCOMPLETE) continue; nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); if (nbp == NULL) goto restart; FREE_LOCK(ump); if ((error = bwrite(nbp)) != 0) goto out; ACQUIRE_LOCK(ump); goto restart; } continue; case D_PAGEDEP: /* * Only flush directory entries in synchronous passes. */ if (waitfor != MNT_WAIT) { error = EBUSY; goto out_unlock; } /* * While syncing snapshots, we must allow recursive * lookups. */ BUF_AREC(bp); /* * We are trying to sync a directory that may * have dependencies on both its own metadata * and/or dependencies on the inodes of any * recently allocated files. We walk its diradd * lists pushing out the associated inode. */ pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; if ((error = flush_pagedep_deps(vp, wk->wk_mp, &pagedep->pd_diraddhd[i]))) { BUF_NOREC(bp); goto out_unlock; } } BUF_NOREC(bp); continue; case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: case D_JNEWBLK: continue; default: panic("softdep_sync_buf: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out_unlock: FREE_LOCK(ump); out: return (error); } /* * Flush the dependencies associated with an inodedep. */ static int flush_inodedep_deps(vp, mp, ino) struct vnode *vp; struct mount *mp; ino_t ino; { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; int error, waitfor; /* * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. * We give a brief window at the top of the loop to allow * any pending I/O to complete. 
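 *
 * A minimal sketch of the two-pass idiom described above (illustrative
 * only; the real loop below also revalidates the inodedep and waits on
 * journal references each round):
 *
 *	for (waitfor = MNT_NOWAIT; ; ) {
 *		if (flush_deplist(..., waitfor, &error))
 *			continue;		(started a write; rescan)
 *		if (waitfor == MNT_WAIT)
 *			break;			(blocking pass completed)
 *		waitfor = MNT_WAIT;		(switch to the blocking pass)
 *	}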
*/ ump = VFSTOUFS(mp); LOCK_OWNED(ump); for (error = 0, waitfor = MNT_NOWAIT; ; ) { if (error) return (error); FREE_LOCK(ump); ACQUIRE_LOCK(ump); restart: if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) continue; /* * If pass2, we are done, otherwise do pass 2. */ if (waitfor == MNT_WAIT) break; waitfor = MNT_WAIT; } /* * Try freeing inodedep in case all dependencies have been removed. */ if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) (void) free_inodedep(inodedep); return (0); } /* * Flush an inode dependency list. */ static int flush_deplist(listhead, waitfor, errorp) struct allocdirectlst *listhead; int waitfor; int *errorp; { struct allocdirect *adp; struct newblk *newblk; struct ufsmount *ump; struct buf *bp; if ((adp = TAILQ_FIRST(listhead)) == NULL) return (0); ump = VFSTOUFS(adp->ad_list.wk_mp); LOCK_OWNED(ump); TAILQ_FOREACH(adp, listhead, ad_next) { newblk = (struct newblk *)adp; if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); return (1); } if (newblk->nb_state & DEPCOMPLETE) continue; bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) continue; return (1); } FREE_LOCK(ump); if (waitfor == MNT_NOWAIT) bawrite(bp); else *errorp = bwrite(bp); ACQUIRE_LOCK(ump); return (1); } return (0); } /* * Flush dependencies associated with an allocdirect block. */ static int flush_newblk_dep(vp, mp, lbn) struct vnode *vp; struct mount *mp; ufs_lbn_t lbn; { struct newblk *newblk; struct ufsmount *ump; struct bufobj *bo; struct inode *ip; struct buf *bp; ufs2_daddr_t blkno; int error; error = 0; bo = &vp->v_bufobj; ip = VTOI(vp); blkno = DIP(ip, i_db[lbn]); if (blkno == 0) panic("flush_newblk_dep: Missing block"); ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); /* * Loop until all dependencies related to this block are satisfied. * We must be careful to restart after each sleep in case a write * completes some part of this process for us. */ for (;;) { if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { FREE_LOCK(ump); break; } if (newblk->nb_list.wk_type != D_ALLOCDIRECT) panic("flush_newblk_dep: Bad newblk %p", newblk); /* * Flush the journal. */ if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); continue; } /* * Write the bitmap dependency. */ if ((newblk->nb_state & DEPCOMPLETE) == 0) { bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); if (bp == NULL) continue; FREE_LOCK(ump); error = bwrite(bp); if (error) break; ACQUIRE_LOCK(ump); continue; } /* * Write the buffer. */ FREE_LOCK(ump); BO_LOCK(bo); bp = gbincore(bo, lbn); if (bp != NULL) { error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)); if (error == ENOLCK) { ACQUIRE_LOCK(ump); error = 0; continue; /* Slept, retry */ } if (error != 0) break; /* Failed */ if (bp->b_flags & B_DELWRI) { bremfree(bp); error = bwrite(bp); if (error) break; } else BUF_UNLOCK(bp); } else BO_UNLOCK(bo); /* * We have to wait for the direct pointers to * point at the newdirblk before the dependency * will go away. 
*/ error = ffs_update(vp, 1); if (error) break; ACQUIRE_LOCK(ump); } return (error); } /* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. */ static int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; int error = 0; struct buf *bp; ino_t inum; struct diraddhd unfinished; LIST_INIT(&unfinished); ump = VFSTOUFS(mp); LOCK_OWNED(ump); restart: while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry * has a MKDIR_PARENT dependency. */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(ump); if ((error = ffs_update(pvp, 1)) != 0) break; ACQUIRE_LOCK(ump); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; /* * All MKDIR_PARENT dependencies and all the * NEWBLOCK pagedeps that are contained in direct * blocks were resolved by doing above ffs_update. * Pagedeps contained in indirect blocks may * require a complete sync'ing of the directory. * We are in the midst of doing a complete sync, * so if they are not resolved in this pass we * defer them for now as they will be sync'ed by * our caller shortly. */ LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&unfinished, dap, da_pdlist); continue; } /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be * committed in its parent. */ inum = dap->da_newinum; if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode1"); /* * Wait for any pending journal adds to complete so we don't * cause rollbacks while syncing. */ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(ump); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; MPASS(VTOI(vp)->i_mode != 0); error = flush_newblk_dep(vp, mp, 0); /* * If we still have the dependency we might need to * update the vnode to sync the new link count to * disk. */ if (error == 0 && dap == LIST_FIRST(diraddhdp)) error = ffs_update(vp, 1); vput(vp); if (error != 0) break; ACQUIRE_LOCK(ump); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_BODY) { inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); panic("flush_pagedep_deps: MKDIR_BODY " "inodedep %p dap %p vp %p", inodedep, dap, vp); } } /* * Flush the inode on which the directory entry depends. * Having accounted for MKDIR_PARENT and MKDIR_BODY above, * the only remaining dependency is that the updated inode * count must get pushed to disk. The inode has already * been pushed into its inode buffer (via VOP_UPDATE) at * the time of the reference count change. So we need only * locate that buffer, ensure that there will be no rollback * caused by a bitmap dependency, then write the inode buffer. */ retry: if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode"); /* * If the inode still has bitmap dependencies, * push them to disk. 
*/ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { bp = inodedep->id_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); if (bp == NULL) goto retry; FREE_LOCK(ump); if ((error = bwrite(bp)) != 0) break; ACQUIRE_LOCK(ump); if (dap != LIST_FIRST(diraddhdp)) continue; } /* * If the inode is still sitting in a buffer waiting * to be written or waiting for the link count to be * adjusted, update it here to flush it to disk. */ if (dap == LIST_FIRST(diraddhdp)) { FREE_LOCK(ump); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; MPASS(VTOI(vp)->i_mode != 0); error = ffs_update(vp, 1); vput(vp); if (error) break; ACQUIRE_LOCK(ump); } /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == LIST_FIRST(diraddhdp)) { inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); panic("flush_pagedep_deps: failed to flush " "inodedep %p ino %ju dap %p", inodedep, (uintmax_t)inum, dap); } } if (error) ACQUIRE_LOCK(ump); while ((dap = LIST_FIRST(&unfinished)) != NULL) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist); } return (error); } /* * A large burst of file addition or deletion activity can drive the * memory load excessively high. First attempt to slow things down * using the techniques below. If that fails, this routine requests * the offending operations to fall back to running synchronously * until the memory load returns to a reasonable level. */ int softdep_slowdown(vp) struct vnode *vp; { struct ufsmount *ump; int jlow; int max_softdeps_hard; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_slowdown called on non-softdep filesystem")); ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); jlow = 0; /* * Check for journal space if needed. */ if (DOINGSUJ(vp)) { if (journal_space(ump, 0) == 0) jlow = 1; } /* * If the system is under its limits and our filesystem is * not responsible for more than our share of the usage and * we are not low on journal space, then no need to slow down. */ max_softdeps_hard = max_softdeps * 11 / 10; if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && dep_current[D_INODEDEP] < max_softdeps_hard && dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 && dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 && ump->softdep_curdeps[D_DIRREM] < (max_softdeps_hard / 2) / stat_flush_threads && ump->softdep_curdeps[D_INODEDEP] < max_softdeps_hard / stat_flush_threads && ump->softdep_curdeps[D_INDIRDEP] < (max_softdeps_hard / 1000) / stat_flush_threads && ump->softdep_curdeps[D_FREEBLKS] < max_softdeps_hard / stat_flush_threads) { FREE_LOCK(ump); return (0); } /* * If the journal is low or our filesystem is over its limit * then speed up the cleanup. */ if (ump->softdep_curdeps[D_INDIRDEP] < (max_softdeps_hard / 1000) / stat_flush_threads || jlow) softdep_speedup(ump); stat_sync_limit_hit += 1; FREE_LOCK(ump); /* * We only slow down the rate at which new dependencies are * generated if we are not using journaling. With journaling, * the cleanup should always be sufficient to keep things * under control. */ if (DOINGSUJ(vp)) return (0); return (1); } /* * Called by the allocation routines when they are about to fail * in the hope that we can free up the requested resource (inodes * or disk space). * * First check to see if the work list has anything on it. If it has, * clean up entries until we successfully free the requested resource.
* Because this process holds inodes locked, we cannot handle any remove * requests that might block on a locked inode as that could lead to * deadlock. If the worklist yields none of the requested resource, * start syncing out vnodes to free up the needed space. */ int softdep_request_cleanup(fs, vp, cred, resource) struct fs *fs; struct vnode *vp; struct ucred *cred; int resource; { struct ufsmount *ump; struct mount *mp; long starttime; ufs2_daddr_t needed; int error, failed_vnode; /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to process any * worklist items as we will recurse into the copyonwrite * routine. This will result in an incoherent snapshot. * If the vnode that we hold is a snapshot, we must avoid * handling other resources that could cause deadlock. */ if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp))) return (0); if (resource == FLUSH_BLOCKS_WAIT) stat_cleanup_blkrequests += 1; else stat_cleanup_inorequests += 1; mp = vp->v_mount; ump = VFSTOUFS(mp); mtx_assert(UFS_MTX(ump), MA_OWNED); UFS_UNLOCK(ump); error = ffs_update(vp, 1); if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) { UFS_LOCK(ump); return (0); } /* * If we are in need of resources, start by cleaning up * any block removals associated with our inode. */ ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); FREE_LOCK(ump); /* * Now clean up at least as many resources as we will need. * * When requested to clean up inodes, the number that are needed * is set by the number of simultaneous writers (mnt_writeopcount) * plus a bit of slop (2) in case some more writers show up while * we are cleaning. * * When requested to free up space, the amount of space that * we need is enough blocks to allocate a full-sized segment * (fs_contigsumsize). The number of such segments that will * be needed is set by the number of simultaneous writers * (mnt_writeopcount) plus a bit of slop (2) in case some more * writers show up while we are cleaning. * * Additionally, if we are unprivileged and allocating space, * we need to ensure that we clean up enough blocks to get the * needed number of blocks over the threshold of the minimum * number of blocks required to be kept free by the filesystem * (fs_minfree).
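 *
 * Worked example (illustrative numbers only): with 10 simultaneous
 * writers and fs_contigsumsize of 16, a FLUSH_BLOCKS_WAIT request
 * computes needed = (10 + 2) * 16 = 192 blocks; an unprivileged
 * caller then adds enough blocks to bring the free count back over
 * the fs_minfree reserve.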
*/ if (resource == FLUSH_INODES_WAIT) { needed = vfs_mount_fetch_counter(vp->v_mount, MNT_COUNT_WRITEOPCOUNT) + 2; } else if (resource == FLUSH_BLOCKS_WAIT) { needed = (vfs_mount_fetch_counter(vp->v_mount, MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE)) needed += fragstoblks(fs, roundup((fs->fs_dsize * fs->fs_minfree / 100) - fs->fs_cstotal.cs_nffree, fs->fs_frag)); } else { printf("softdep_request_cleanup: Unknown resource type %d\n", resource); UFS_LOCK(ump); return (0); } starttime = time_second; retry: if (resource == FLUSH_BLOCKS_WAIT && fs->fs_cstotal.cs_nbfree <= needed) softdep_send_speedup(ump, needed * fs->fs_bsize, BIO_SPEEDUP_TRIM); if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { ACQUIRE_LOCK(ump); if (ump->softdep_on_worklist > 0 && process_worklist_item(UFSTOVFS(ump), ump->softdep_on_worklist, LK_NOWAIT) != 0) stat_worklist_push += 1; FREE_LOCK(ump); } /* * If we still need resources and there are no more worklist * entries to process to obtain them, we have to start flushing * the dirty vnodes to force the release of additional requests * to the worklist that we can then process to reap additional * resources. We walk the vnodes associated with the mount point * until we get the needed worklist requests that we can reap. * * If there are several threads all needing to clean the same * mount point, only one is allowed to walk the mount list. * When several threads all try to walk the same mount list, * they end up competing with each other and often end up in * livelock. This approach ensures that forward progress is * made at the cost of occasional ENOSPC errors being returned * that might otherwise have been avoided. */ error = 1; if ((resource == FLUSH_BLOCKS_WAIT && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { ACQUIRE_LOCK(ump); if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) { ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE; FREE_LOCK(ump); failed_vnode = softdep_request_cleanup_flush(mp, ump); ACQUIRE_LOCK(ump); ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE; FREE_LOCK(ump); if (ump->softdep_on_worklist > 0) { stat_cleanup_retries += 1; if (!failed_vnode) goto retry; } } else { FREE_LOCK(ump); error = 0; } stat_cleanup_failures += 1; } if (time_second - starttime > stat_cleanup_high_delay) stat_cleanup_high_delay = time_second - starttime; UFS_LOCK(ump); return (error); } /* * Scan the vnodes for the specified mount point flushing out any * vnodes that can be locked without waiting. Finally, try to flush * the device associated with the mount point if it can be locked * without waiting. * * We return 0 if we were able to lock every vnode in our scan. * If we had to skip one or more vnodes, we return 1.
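 *
 * Entry is serialized by the caller using the FLUSH_RC_ACTIVE flag, so
 * at most one thread walks the mount list at a time; in outline (a
 * condensed sketch of the caller above):
 *
 *	if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
 *		ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
 *		failed_vnode = softdep_request_cleanup_flush(mp, ump);
 *		ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
 *	}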
*/ static int softdep_request_cleanup_flush(mp, ump) struct mount *mp; struct ufsmount *ump; { struct thread *td; struct vnode *lvp, *mvp; int failed_vnode; failed_vnode = 0; td = curthread; MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) { if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) { VI_UNLOCK(lvp); continue; } if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT) != 0) { failed_vnode = 1; continue; } if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(lvp); continue; } (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0); vput(lvp); } lvp = ump->um_devvp; if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { VOP_FSYNC(lvp, MNT_NOWAIT, td); VOP_UNLOCK(lvp); } return (failed_vnode); } static bool softdep_excess_items(struct ufsmount *ump, int item) { KASSERT(item >= 0 && item < D_LAST, ("item %d", item)); return (dep_current[item] > max_softdeps && ump->softdep_curdeps[item] > max_softdeps / stat_flush_threads); } static void schedule_cleanup(struct mount *mp) { struct ufsmount *ump; struct thread *td; ump = VFSTOUFS(mp); LOCK_OWNED(ump); FREE_LOCK(ump); td = curthread; if ((td->td_pflags & TDP_KTHREAD) != 0 && (td->td_proc->p_flag2 & P2_AST_SU) == 0) { /* * No AST is delivered to kernel threads, so nobody * would deref the mp. Some kernel threads * explicitly check for AST, e.g. NFS daemon does * this in the serving loop. */ return; } if (td->td_su != NULL) vfs_rel(td->td_su); vfs_ref(mp); td->td_su = mp; thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } static void softdep_ast_cleanup_proc(struct thread *td) { struct mount *mp; struct ufsmount *ump; int error; bool req; while ((mp = td->td_su) != NULL) { td->td_su = NULL; error = vfs_busy(mp, MBF_NOWAIT); vfs_rel(mp); if (error != 0) return; if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) { ump = VFSTOUFS(mp); for (;;) { req = false; ACQUIRE_LOCK(ump); if (softdep_excess_items(ump, D_INODEDEP)) { req = true; request_cleanup(mp, FLUSH_INODES); } if (softdep_excess_items(ump, D_DIRREM)) { req = true; request_cleanup(mp, FLUSH_BLOCKS); } FREE_LOCK(ump); if (softdep_excess_items(ump, D_NEWBLK) || softdep_excess_items(ump, D_ALLOCDIRECT) || softdep_excess_items(ump, D_ALLOCINDIR)) { error = vn_start_write(NULL, &mp, V_WAIT); if (error == 0) { req = true; VFS_SYNC(mp, MNT_WAIT); vn_finished_write(mp); } } if ((td->td_pflags & TDP_KTHREAD) != 0 || !req) break; } } vfs_unbusy(mp); } if ((mp = td->td_su) != NULL) { td->td_su = NULL; vfs_rel(mp); } } /* * If memory utilization has gotten too high, deliberately slow things * down and speed up the I/O processing. */ static int request_cleanup(mp, resource) struct mount *mp; int resource; { struct thread *td = curthread; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); /* * We never hold up the filesystem syncer or buf daemon. */ if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) return (0); /* * First check to see if the work list has gotten backlogged. * If it has, co-opt this process to help clean up two entries. * Because this process may hold inodes locked, we cannot * handle any remove requests that might block on a locked * inode as that could lead to deadlock. We set TDP_SOFTDEP * to avoid recursively processing the worklist. */ if (ump->softdep_on_worklist > max_softdeps / 10) { td->td_pflags |= TDP_SOFTDEP; process_worklist_item(mp, 2, LK_NOWAIT); td->td_pflags &= ~TDP_SOFTDEP; stat_worklist_push += 2; return(1); } /* * Next, we attempt to speed up the syncer process. If that * is successful, then we allow the process to continue.
*/ if (softdep_speedup(ump) && resource != FLUSH_BLOCKS_WAIT && resource != FLUSH_INODES_WAIT) return(0); /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories * with removal dependencies. We would like to do the cleanup * here, but we probably hold an inode locked at this point and * that might deadlock against one that we try to clean. So, * the best that we can do is request the syncer daemon to do * the cleanup for us. */ switch (resource) { case FLUSH_INODES: case FLUSH_INODES_WAIT: ACQUIRE_GBLLOCK(&lk); stat_ino_limit_push += 1; req_clear_inodedeps += 1; FREE_GBLLOCK(&lk); stat_countp = &stat_ino_limit_hit; break; case FLUSH_BLOCKS: case FLUSH_BLOCKS_WAIT: ACQUIRE_GBLLOCK(&lk); stat_blk_limit_push += 1; req_clear_remove += 1; FREE_GBLLOCK(&lk); stat_countp = &stat_blk_limit_hit; break; default: panic("request_cleanup: unknown type"); } /* * Hopefully the syncer daemon will catch up and awaken us. * We wait at most tickdelay before proceeding in any case. */ ACQUIRE_GBLLOCK(&lk); FREE_LOCK(ump); proc_waiting += 1; if (callout_pending(&softdep_callout) == FALSE) callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, pause_timer, 0); if ((td->td_pflags & TDP_KTHREAD) == 0) msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); proc_waiting -= 1; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump); return (1); } /* * Awaken processes pausing in request_cleanup and clear proc_waiting * to indicate that there is no longer a timer running. Pause_timer * will be called with the global softdep mutex (&lk) locked. */ static void pause_timer(arg) void *arg; { GBLLOCK_OWNED(&lk); /* * The callout_ API has acquired mtx and will hold it around this * function call. */ *stat_countp += proc_waiting; wakeup(&proc_waiting); } /* * If requested, try removing inode or removal dependencies. */ static void check_clear_deps(mp) struct mount *mp; { struct ufsmount *ump; bool suj_susp; /* * Tell the lower layers that any TRIM or WRITE transactions that have * been delayed for performance reasons should proceed to help alleviate * the shortage faster. The race between checking req_* and the softdep * mutex (lk) is fine since this is an advisory operation that at most * causes deferred work to be done sooner. */ ump = VFSTOUFS(mp); suj_susp = MOUNTEDSUJ(mp) && ump->softdep_jblocks->jb_suspended; if (req_clear_remove || req_clear_inodedeps || suj_susp) { FREE_LOCK(ump); softdep_send_speedup(ump, 0, BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE); ACQUIRE_LOCK(ump); } /* * If we are suspended, it may be because of our using * too many inodedeps, so help clear them out. */ if (suj_susp) clear_inodedeps(mp); /* * General requests for cleanup of backed up dependencies */ ACQUIRE_GBLLOCK(&lk); if (req_clear_inodedeps) { req_clear_inodedeps -= 1; FREE_GBLLOCK(&lk); clear_inodedeps(mp); ACQUIRE_GBLLOCK(&lk); wakeup(&proc_waiting); } if (req_clear_remove) { req_clear_remove -= 1; FREE_GBLLOCK(&lk); clear_remove(mp); ACQUIRE_GBLLOCK(&lk); wakeup(&proc_waiting); } FREE_GBLLOCK(&lk); } /* * Flush out a directory with at least one removal dependency in an effort to * reduce the number of dirrem, freefile, and freeblks dependency structures. 
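 *
 * This is the consumer half of a simple request/acknowledge handshake
 * (sketched here for clarity; see request_cleanup() and
 * check_clear_deps() above):
 *
 *	request_cleanup():   req_clear_remove += 1; sleep on proc_waiting
 *	check_clear_deps():  req_clear_remove -= 1; clear_remove(mp);
 *			     wakeup(&proc_waiting);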
*/ static void clear_remove(mp) struct mount *mp; { struct pagedep_hashhead *pagedephd; struct pagedep *pagedep; struct ufsmount *ump; struct vnode *vp; struct bufobj *bo; int error, cnt; ino_t ino; ump = VFSTOUFS(mp); LOCK_OWNED(ump); for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) { pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++]; if (ump->pagedep_nextclean > ump->pagedep_hash_size) ump->pagedep_nextclean = 0; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (LIST_EMPTY(&pagedep->pd_dirremhd)) continue; ino = pagedep->pd_ino; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(ump); /* * Let unmount clear deps */ error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) goto finish_write; error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); vfs_unbusy(mp); if (error != 0) { softdep_error("clear_remove: vget", error); goto finish_write; } MPASS(VTOI(vp)->i_mode != 0); if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) softdep_error("clear_remove: fsync", error); bo = &vp->v_bufobj; BO_LOCK(bo); drain_output(vp); BO_UNLOCK(bo); vput(vp); finish_write: vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } } } /* * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ static void clear_inodedeps(mp) struct mount *mp; { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; struct ufsmount *ump; struct vnode *vp; struct fs *fs; int error, cnt; ino_t firstino, lastino, ino; ump = VFSTOUFS(mp); fs = ump->um_fs; LOCK_OWNED(ump); /* * Pick a random inode dependency to be cleared. * We will then gather up all the inodes in its block * that have dependencies and flush them out. */ for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) { inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++]; if (ump->inodedep_nextclean > ump->inodedep_hash_size) ump->inodedep_nextclean = 0; if ((inodedep = LIST_FIRST(inodedephd)) != NULL) break; } if (inodedep == NULL) return; /* * Find the last inode in the block with dependencies. */ firstino = rounddown2(inodedep->id_ino, INOPB(fs)); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) break; /* * Asynchronously push all but the last inode with dependencies. * Synchronously push the last inode with dependencies to ensure * that the inode block gets written to free up the inodedeps. 
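 *
 * Worked example (illustrative numbers only): with INOPB(fs) == 64 and
 * a dependency found on inode 130, firstino is rounddown2(130, 64),
 * i.e. 128, and lastino is scanned downward from 128 + 64 - 1 == 191;
 * inodes 128..lastino with dependencies are then pushed, the last one
 * synchronously.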
*/ for (ino = firstino; ino <= lastino; ino++) { if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) continue; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(ump); error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */ if (error != 0) { vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) { softdep_error("clear_inodedeps: vget", error); vfs_unbusy(mp); vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } vfs_unbusy(mp); if (VTOI(vp)->i_mode == 0) { vgone(vp); } else if (ino == lastino) { if ((error = ffs_syncvnode(vp, MNT_WAIT, 0))) softdep_error("clear_inodedeps: fsync1", error); } else { if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) softdep_error("clear_inodedeps: fsync2", error); BO_LOCK(&vp->v_bufobj); drain_output(vp); BO_UNLOCK(&vp->v_bufobj); } vput(vp); vn_finished_write(mp); ACQUIRE_LOCK(ump); } } void softdep_buf_append(bp, wkhd) struct buf *bp; struct workhead *wkhd; { struct worklist *wk; struct ufsmount *ump; if ((wk = LIST_FIRST(wkhd)) == NULL) return; KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_buf_append called on non-softdep filesystem")); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(wkhd)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(ump); } void softdep_inode_append(ip, cred, wkhd) struct inode *ip; struct ucred *cred; struct workhead *wkhd; { struct buf *bp; struct fs *fs; struct ufsmount *ump; int error; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_inode_append called on non-softdep filesystem")); fs = ump->um_fs; error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, cred, &bp); if (error) { bqrelse(bp); softdep_freework(wkhd); return; } softdep_buf_append(bp, wkhd); bqrelse(bp); } void softdep_freework(wkhd) struct workhead *wkhd; { struct worklist *wk; struct ufsmount *ump; if ((wk = LIST_FIRST(wkhd)) == NULL) return; KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_freework called on non-softdep filesystem")); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); handle_jwork(wkhd); FREE_LOCK(ump); } static struct ufsmount * softdep_bp_to_mp(bp) struct buf *bp; { struct mount *mp; struct vnode *vp; if (LIST_EMPTY(&bp->b_dep)) return (NULL); vp = bp->b_vp; KASSERT(vp != NULL, ("%s, buffer with dependencies lacks vnode", __func__)); /* * The ump mount point is stable after we get a correct * pointer, since bp is locked and this prevents unmount from * proceeding. But to get to it, we cannot dereference bp->b_dep * head wk_mp, because we do not yet own SU ump lock and * workitem might be freed while dereferenced. */ retry: switch (vp->v_type) { case VCHR: VI_LOCK(vp); mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL; VI_UNLOCK(vp); if (mp == NULL) goto retry; break; case VREG: case VDIR: case VLNK: case VFIFO: case VSOCK: mp = vp->v_mount; break; case VBLK: vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n"); /* FALLTHROUGH */ case VNON: case VBAD: case VMARKER: mp = NULL; break; default: vn_printf(vp, "unknown vnode type"); mp = NULL; break; } return (VFSTOUFS(mp)); } /* * Function to determine if the buffer has outstanding dependencies * that will cause a roll-back if the buffer is written. If wantcount * is set, return number of dependencies, otherwise just yes or no. 
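 *
 * The typical use is a boolean query, as in softdep_sync_buf() above:
 *
 *	if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
 *		return (EBUSY);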
*/ static int softdep_count_dependencies(bp, wantcount) struct buf *bp; int wantcount; { struct worklist *wk; struct ufsmount *ump; struct bmsafemap *bmsafemap; struct freework *freework; struct inodedep *inodedep; struct indirdep *indirdep; struct freeblks *freeblks; struct allocindir *aip; struct pagedep *pagedep; struct dirrem *dirrem; struct newblk *newblk; struct mkdir *mkdir; struct diradd *dap; int i, retval; ump = softdep_bp_to_mp(bp); if (ump == NULL) return (0); retval = 0; ACQUIRE_LOCK(ump); LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_INODEDEP: inodedep = WK_INODEDEP(wk); if ((inodedep->id_state & DEPCOMPLETE) == 0) { /* bitmap allocation dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_extupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoreflst)) { /* Add reference dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) { /* indirect truncation dependency */ retval += 1; if (!wantcount) goto out; } LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { /* indirect block pointer dependency */ retval += 1; if (!wantcount) goto out; } continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { if (LIST_FIRST(&dirrem->dm_jremrefhd)) { /* Journal remove ref dependency. */ retval += 1; if (!wantcount) goto out; } } for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { /* directory entry dependency */ retval += 1; if (!wantcount) goto out; } } continue; case D_BMSAFEMAP: bmsafemap = WK_BMSAFEMAP(wk); if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { /* Add reference dependency. */ retval += 1; if (!wantcount) goto out; } if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { /* Allocate block dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_FREEBLKS: freeblks = WK_FREEBLKS(wk); if (LIST_FIRST(&freeblks->fb_jblkdephd)) { /* Freeblk journal dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk) { /* Journal allocate dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_MKDIR: mkdir = WK_MKDIR(wk); if (mkdir->md_jaddref) { /* Journal reference dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: case D_JSEG: case D_SBDEP: /* never a dependency on these blocks */ continue; default: panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out: FREE_LOCK(ump); return (retval); } /* * Acquire exclusive access to a buffer. * Must be called with a locked mtx parameter. * Return acquired buffer or NULL on failure. */ static struct buf * getdirtybuf(bp, lock, waitfor) struct buf *bp; struct rwlock *lock; int waitfor; { int error; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) { if (waitfor != MNT_WAIT) return (NULL); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock); /* * Even if we successfully acquire bp here, we have dropped * lock, which may violate our guarantee.
*/ if (error == 0) BUF_UNLOCK(bp); else if (error != ENOLCK) panic("getdirtybuf: inconsistent lock: %d", error); rw_wlock(lock); return (NULL); } if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) { rw_wunlock(lock); BO_LOCK(bp->b_bufobj); BUF_UNLOCK(bp); if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO | PDROP, "getbuf", 0); } else BO_UNLOCK(bp->b_bufobj); rw_wlock(lock); return (NULL); } BUF_UNLOCK(bp); if (waitfor != MNT_WAIT) return (NULL); #ifdef DEBUG_VFS_LOCKS if (bp->b_vp->v_type != VCHR) ASSERT_BO_WLOCKED(bp->b_bufobj); #endif bp->b_vflags |= BV_BKGRDWAIT; rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0); return (NULL); } if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); return (NULL); } bremfree(bp); return (bp); } /* * Check if it is safe to suspend the file system now. On entry, * the vnode interlock for devvp should be held. Return 0 with * the mount interlock held if the file system can be suspended now, * otherwise return EAGAIN with the mount interlock held. */ int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_depcnt, int softdep_accdepcnt, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; struct ufsmount *ump; struct inodedep *inodedep; int error, unlinked; bo = &devvp->v_bufobj; ASSERT_BO_WLOCKED(bo); /* * If we are not running with soft updates, then we need only * deal with secondary writes as we try to suspend. */ if (MOUNTEDSOFTDEP(mp) == 0) { MNT_ILOCK(mp); while (mp->mnt_secondary_writes != 0) { BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); MNT_ILOCK(mp); } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; BO_UNLOCK(bo); return (error); } /* * If we are running with soft updates, then we need to coordinate * with them as we try to suspend. */ ump = VFSTOUFS(mp); for (;;) { if (!TRY_ACQUIRE_LOCK(ump)) { BO_UNLOCK(bo); ACQUIRE_LOCK(ump); FREE_LOCK(ump); BO_LOCK(bo); continue; } MNT_ILOCK(mp); if (mp->mnt_secondary_writes != 0) { FREE_LOCK(ump); BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); continue; } break; } unlinked = 0; if (MOUNTEDSUJ(mp)) { for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked); inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { if ((inodedep->id_state & (UNLINKED | UNLINKLINKS | UNLINKONLIST)) != (UNLINKED | UNLINKLINKS | UNLINKONLIST) || !check_inodedep_free(inodedep)) continue; unlinked++; } } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. 
* - Softdep activity occurred after start of vnode sync loop * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || softdep_depcnt != unlinked || ump->softdep_deps != unlinked || softdep_accdepcnt != ump->softdep_accdeps || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; FREE_LOCK(ump); BO_UNLOCK(bo); return (error); } /* * Get the number of dependency structures for the file system, both * the current number and the total number allocated. These will * later be used to detect that softdep processing has occurred. */ void softdep_get_depcounts(struct mount *mp, int *softdep_depsp, int *softdep_accdepsp) { struct ufsmount *ump; if (MOUNTEDSOFTDEP(mp) == 0) { *softdep_depsp = 0; *softdep_accdepsp = 0; return; } ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); *softdep_depsp = ump->softdep_deps; *softdep_accdepsp = ump->softdep_accdeps; FREE_LOCK(ump); } /* * Wait for pending output on a vnode to complete. */ static void drain_output(vp) struct vnode *vp; { ASSERT_VOP_LOCKED(vp, "drain_output"); (void)bufobj_wwait(&vp->v_bufobj, 0, 0); } /* * Called whenever a buffer that is being invalidated or reallocated * contains dependencies. This should only happen if an I/O error has * occurred. The routine is called with the buffer locked. */ static void softdep_deallocate_dependencies(bp) struct buf *bp; { if ((bp->b_ioflags & BIO_ERROR) == 0) panic("softdep_deallocate_dependencies: dangling deps"); if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL) softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); else printf("softdep_deallocate_dependencies: " "got error %d while accessing filesystem\n", bp->b_error); if (bp->b_error != ENXIO) panic("softdep_deallocate_dependencies: unrecovered I/O error"); } /* * Function to handle asynchronous write errors in the filesystem. */ static void softdep_error(func, error) char *func; int error; { /* XXX should do something better! 
*/ printf("%s: got error %d while accessing filesystem\n", func, error); } #ifdef DDB /* exported to ffs_vfsops.c */ extern void db_print_ffs(struct ufsmount *ump); void db_print_ffs(struct ufsmount *ump) { db_printf("mp %p (%s) devvp %p\n", ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp); db_printf(" fs %p su_wl %d su_deps %d su_req %d\n", ump->um_fs, ump->softdep_on_worklist, ump->softdep_deps, ump->softdep_req); } static void worklist_print(struct worklist *wk, int verbose) { if (!verbose) { db_printf("%s: %p state 0x%b\n", TYPENAME(wk->wk_type), wk, (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS); return; } db_printf("worklist: %p type %s state 0x%b next %p\n ", wk, TYPENAME(wk->wk_type), (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS, LIST_NEXT(wk, wk_list)); db_print_ffs(VFSTOUFS(wk->wk_mp)); } static void inodedep_print(struct inodedep *inodedep, int verbose) { worklist_print(&inodedep->id_list, 0); db_printf(" fs %p ino %jd inoblk %jd delta %jd nlink %jd\n", inodedep->id_fs, (intmax_t)inodedep->id_ino, (intmax_t)fsbtodb(inodedep->id_fs, ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), (intmax_t)inodedep->id_nlinkdelta, (intmax_t)inodedep->id_savednlink); if (verbose == 0) return; db_printf(" bmsafemap %p, mkdiradd %p, inoreflst %p\n", inodedep->id_bmsafemap, inodedep->id_mkdiradd, TAILQ_FIRST(&inodedep->id_inoreflst)); db_printf(" dirremhd %p, pendinghd %p, bufwait %p\n", LIST_FIRST(&inodedep->id_dirremhd), LIST_FIRST(&inodedep->id_pendinghd), LIST_FIRST(&inodedep->id_bufwait)); db_printf(" inowait %p, inoupdt %p, newinoupdt %p\n", LIST_FIRST(&inodedep->id_inowait), TAILQ_FIRST(&inodedep->id_inoupdt), TAILQ_FIRST(&inodedep->id_newinoupdt)); db_printf(" extupdt %p, newextupdt %p, freeblklst %p\n", TAILQ_FIRST(&inodedep->id_extupdt), TAILQ_FIRST(&inodedep->id_newextupdt), TAILQ_FIRST(&inodedep->id_freeblklst)); db_printf(" saveino %p, savedsize %jd, savedextsize %jd\n", inodedep->id_savedino1, (intmax_t)inodedep->id_savedsize, (intmax_t)inodedep->id_savedextsize); } static void newblk_print(struct newblk *nbp) { worklist_print(&nbp->nb_list, 0); db_printf(" newblkno %jd\n", (intmax_t)nbp->nb_newblkno); db_printf(" jnewblk %p, bmsafemap %p, freefrag %p\n", &nbp->nb_jnewblk, &nbp->nb_bmsafemap, &nbp->nb_freefrag); db_printf(" indirdeps %p, newdirblk %p, jwork %p\n", LIST_FIRST(&nbp->nb_indirdeps), LIST_FIRST(&nbp->nb_newdirblk), LIST_FIRST(&nbp->nb_jwork)); } static void allocdirect_print(struct allocdirect *adp) { newblk_print(&adp->ad_block); db_printf(" oldblkno %jd, oldsize %ld, newsize %ld\n", adp->ad_oldblkno, adp->ad_oldsize, adp->ad_newsize); db_printf(" offset %d, inodedep %p\n", adp->ad_offset, adp->ad_inodedep); } static void allocindir_print(struct allocindir *aip) { newblk_print(&aip->ai_block); db_printf(" oldblkno %jd, lbn %jd\n", (intmax_t)aip->ai_oldblkno, (intmax_t)aip->ai_lbn); db_printf(" offset %d, indirdep %p\n", aip->ai_offset, aip->ai_indirdep); } static void mkdir_print(struct mkdir *mkdir) { worklist_print(&mkdir->md_list, 0); db_printf(" diradd %p, jaddref %p, buf %p\n", mkdir->md_diradd, mkdir->md_jaddref, mkdir->md_buf); } DB_SHOW_COMMAND(sd_inodedep, db_show_sd_inodedep) { if (have_addr == 0) { db_printf("inodedep address required\n"); return; } inodedep_print((struct inodedep*)addr, 1); } DB_SHOW_COMMAND(sd_allinodedeps, db_show_sd_allinodedeps) { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; struct ufsmount *ump; int cnt; if (have_addr == 0) { db_printf("ufsmount address required\n"); return; } ump = (struct ufsmount *)addr; 
for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) { inodedephd = &ump->inodedep_hashtbl[cnt]; LIST_FOREACH(inodedep, inodedephd, id_hash) { inodedep_print(inodedep, 0); } } } DB_SHOW_COMMAND(sd_worklist, db_show_sd_worklist) { if (have_addr == 0) { db_printf("worklist address required\n"); return; } worklist_print((struct worklist *)addr, 1); } DB_SHOW_COMMAND(sd_workhead, db_show_sd_workhead) { struct worklist *wk; struct workhead *wkhd; if (have_addr == 0) { db_printf("worklist address required " "(for example value in bp->b_dep)\n"); return; } /* * We often do not have the address of the worklist head but * instead a pointer to its first entry (e.g., we have the * contents of bp->b_dep rather than &bp->b_dep). But the back * pointer of bp->b_dep will point at the head of the list, so * we cheat and use that instead. If we are in the middle of * a list we will still get the same result, so nothing * unexpected will result. */ wk = (struct worklist *)addr; if (wk == NULL) return; wkhd = (struct workhead *)wk->wk_list.le_prev; LIST_FOREACH(wk, wkhd, wk_list) { switch(wk->wk_type) { case D_INODEDEP: inodedep_print(WK_INODEDEP(wk), 0); continue; case D_ALLOCDIRECT: allocdirect_print(WK_ALLOCDIRECT(wk)); continue; case D_ALLOCINDIR: allocindir_print(WK_ALLOCINDIR(wk)); continue; case D_MKDIR: mkdir_print(WK_MKDIR(wk)); continue; default: worklist_print(wk, 0); continue; } } } DB_SHOW_COMMAND(sd_mkdir, db_show_sd_mkdir) { if (have_addr == 0) { db_printf("mkdir address required\n"); return; } mkdir_print((struct mkdir *)addr); } DB_SHOW_COMMAND(sd_mkdir_list, db_show_sd_mkdir_list) { struct mkdirlist *mkdirlisthd; struct mkdir *mkdir; if (have_addr == 0) { db_printf("mkdir listhead address required\n"); return; } mkdirlisthd = (struct mkdirlist *)addr; LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) { mkdir_print(mkdir); if (mkdir->md_diradd != NULL) { db_printf(" "); worklist_print(&mkdir->md_diradd->da_list, 0); } if (mkdir->md_jaddref != NULL) { db_printf(" "); worklist_print(&mkdir->md_jaddref->ja_list, 0); } } } DB_SHOW_COMMAND(sd_allocdirect, db_show_sd_allocdirect) { if (have_addr == 0) { db_printf("allocdirect address required\n"); return; } allocdirect_print((struct allocdirect *)addr); } DB_SHOW_COMMAND(sd_allocindir, db_show_sd_allocindir) { if (have_addr == 0) { db_printf("allocindir address required\n"); return; } allocindir_print((struct allocindir *)addr); } #endif /* DDB */ #endif /* SOFTUPDATES */ Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 364371) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 364372) @@ -1,2683 +1,2683 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; VFS_SMR_DECLARE; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size); static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = vfs_cache_root, .vfs_cachedroot = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. 
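 *
 * For example, a loader.conf(5) line such as (illustrative only):
 *
 *	vfs.root.mountfrom.options="rw,userquota"
 *
 * delivers "userquota" here even though mount(8) generally does not
 * pass it.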
*/ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", "untrusted", NULL }; static int ffs_enxio_enable = 1; SYSCTL_DECL(_vfs_ffs); SYSCTL_INT(_vfs_ffs, OID_AUTO, enxio_enable, CTLFLAG_RWTUN, &ffs_enxio_enable, 0, "enable mapping of other disk I/O errors to ENXIO"); /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. */ static int ffs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp) { struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; int bsize, error; ip = VTOI(vp); fs = ITOFS(ip); lbn = lblkno(fs, offset); bsize = blksize(fs, ip, lbn); *bpp = NULL; error = bread(vp, lbn, bsize, NOCRED, &bp); if (error) { return (error); } if (res) *res = (char *)bp->b_data + blkoff(fs, offset); *bpp = bp; return (0); } /* * Load up the contents of an inode and copy the appropriate pieces * to the incore copy. */ static int ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino) { struct ufs1_dinode *dip1; struct ufs2_dinode *dip2; int error; if (I_IS_UFS1(ip)) { dip1 = ip->i_din1; *dip1 = *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); ip->i_mode = dip1->di_mode; ip->i_nlink = dip1->di_nlink; ip->i_effnlink = dip1->di_nlink; ip->i_size = dip1->di_size; ip->i_flags = dip1->di_flags; ip->i_gen = dip1->di_gen; ip->i_uid = dip1->di_uid; ip->i_gid = dip1->di_gid; return (0); } dip2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); if ((error = ffs_verify_dinode_ckhash(fs, dip2)) != 0 && !ffs_fsfail_cleanup(ITOUMP(ip), error)) { printf("%s: inode %jd: check-hash failed\n", fs->fs_fsmnt, (intmax_t)ino); return (error); } *ip->i_din2 = *dip2; dip2 = ip->i_din2; ip->i_mode = dip2->di_mode; ip->i_nlink = dip2->di_nlink; ip->i_effnlink = dip2->di_nlink; ip->i_size = dip2->di_size; ip->i_flags = dip2->di_flags; ip->i_gen = dip2->di_gen; ip->i_uid = dip2->di_uid; ip->i_gid = dip2->di_gid; return (0); } /* * Verify that a filesystem block number is a valid data block. * This routine is only called on untrusted filesystems. */ static int ffs_check_blkno(struct mount *mp, ino_t inum, ufs2_daddr_t daddr, int blksize) { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t end_daddr; int cg, havemtx; KASSERT((mp->mnt_flag & MNT_UNTRUSTED) != 0, ("ffs_check_blkno called on a trusted file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; cg = dtog(fs, daddr); end_daddr = daddr + numfrags(fs, blksize); /* * Verify that the block number is a valid data block. Also check * that it does not point to an inode block or a superblock. Accept * blocks that are unallocated (0) or part of snapshot metadata * (BLK_NOCOPY or BLK_SNAP). * * Thus, the block must be in a valid range for the filesystem and * either in the space before a backup superblock (except the first * cylinder group where that space is used by the bootstrap code) or * after the inode blocks and before the end of the cylinder group.
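 *
 * Schematically, for cylinder group cg (a rough sketch, not to scale):
 *
 *	cgbase(cg)     cgsblock(cg)           cgdmin(cg)   cgbase(cg)+fs_fpg
 *	|-- data ok ---|-- sb/cg/inode blocks --|----- data ok -----|
 *	(cg > 0 only)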
*/ if ((uint64_t)daddr <= BLK_SNAP || ((uint64_t)end_daddr <= fs->fs_size && ((cg > 0 && end_daddr <= cgsblock(fs, cg)) || (daddr >= cgdmin(fs, cg) && end_daddr <= cgbase(fs, cg) + fs->fs_fpg)))) return (0); if ((havemtx = mtx_owned(UFS_MTX(ump))) == 0) UFS_LOCK(ump); if (ppsratecheck(&ump->um_last_integritymsg, &ump->um_secs_integritymsg, 1)) { UFS_UNLOCK(ump); uprintf("\n%s: inode %jd, out-of-range indirect block " "number %jd\n", mp->mnt_stat.f_mntonname, inum, daddr); if (havemtx) UFS_LOCK(ump); } else if (!havemtx) UFS_UNLOCK(ump); return (EINTEGRITY); } /* * Initiate a forcible unmount. * Used to unmount filesystems whose underlying media has gone away. */ static void ffs_fsfail_unmount(void *v, int pending) { struct fsfail_task *etp; struct mount *mp; etp = v; /* * Find our mount and get a ref on it, then try to unmount. */ mp = vfs_getvfs(&etp->fsid); if (mp != NULL) dounmount(mp, MNT_FORCE, curthread); free(etp, M_UFSMNT); } /* * On first ENXIO error, start a task that forcibly unmounts the filesystem. * * Return true if a cleanup is in progress. */ int ffs_fsfail_cleanup(struct ufsmount *ump, int error) { int retval; UFS_LOCK(ump); retval = ffs_fsfail_cleanup_locked(ump, error); UFS_UNLOCK(ump); return (retval); } int ffs_fsfail_cleanup_locked(struct ufsmount *ump, int error) { struct fsfail_task *etp; struct task *tp; mtx_assert(UFS_MTX(ump), MA_OWNED); if (error == ENXIO && (ump->um_flags & UM_FSFAIL_CLEANUP) == 0) { ump->um_flags |= UM_FSFAIL_CLEANUP; /* * Queue an async forced unmount. */ etp = ump->um_fsfail_task; ump->um_fsfail_task = NULL; if (etp != NULL) { tp = &etp->task; TASK_INIT(tp, 0, ffs_fsfail_unmount, etp); taskqueue_enqueue(taskqueue_thread, tp); printf("UFS: forcibly unmounting %s from %s\n", ump->um_mountp->mnt_stat.f_mntfromname, ump->um_mountp->mnt_stat.f_mntonname); } } return ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0); } /* * Wrapper used during ENXIO cleanup to allocate empty buffers when * the kernel is unable to read the real one. They are needed so that * the soft updates code can use them to unwind its dependencies. 
*/ int ffs_breadz(struct ufsmount *ump, struct vnode *vp, daddr_t lblkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { int error; flags |= GB_CVTENXIO; error = breadn_flags(vp, lblkno, dblkno, size, rablkno, rabsize, cnt, cred, flags, ckhashfunc, bpp); if (error != 0 && ffs_fsfail_cleanup(ump, error)) { error = getblkx(vp, lblkno, dblkno, size, 0, 0, flags, bpp); KASSERT(error == 0, ("getblkx failed")); vfs_bio_bzero_buf(*bpp, 0, size); } return (error); } static int ffs_mount(struct mount *mp) { struct vnode *devvp, *odevvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; pid_t fsckpid = 0; int error, error1, flags; uint64_t mntorflags, saved_mnt_flag; accmode_t accmode; struct nameidata ndp; char *fspec; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); VFS_SMR_ZONE_SET(uma_inode); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "untrusted", NULL, NULL) == 0) mntorflags |= MNT_UNTRUSTED; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { /* * Once we have set the restricted PID, do not * persist "fsckpid" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "fsckpid"); vfs_deleteopt(mp->mnt_opt, "fsckpid"); if (mp->mnt_flag & MNT_UPDATE) { if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } /* Set to -1 if we are done */ if (fsckpid == 0) fsckpid = -1; } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_FPLOOKUP; mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; odevvp = ump->um_odevvp; devvp = ump->um_devvp; if (fsckpid == -1 && ump->um_fsckpid > 0) { if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) return (error); g_topology_lock(); /* * Return to normal read-only mode. 
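 *
 * Illustrative aside, not part of this change: g_access(cp, dr, dw, de)
 * adjusts the consumer's (read, write, exclusive) access counts by
 * deltas, so the transitions in this function pair up:
 *
 *	g_access(cp, 0, -1, 0)		checker done: drop write
 *	g_access(cp, 0, -1, -1)		rw -> ro: drop write + exclusive
 *	g_access(cp, 0, 1, 1)		ro -> rw: add write + exclusive
 *	g_access(cp, 0, 1, 0)		checker start: add write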
*/ error = g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); ump->um_fsckpid = 0; } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (MOUNTEDSOFTDEP(mp)) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vfs_write_resume(mp, 0); return (error); } if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td, 0)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If we are running a checker, do not allow upgrade. */ if (ump->um_fsckpid > 0) { vfs_mount_error(mp, "Active checker, cannot upgrade to write"); return (EINVAL); } /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(odevvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); VOP_UNLOCK(odevvp); if (error) { return (error); } fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } g_topology_lock(); /* * Request exclusive write access. 
 */
			error = g_access(ump->um_cp, 0, 1, 1);
			g_topology_unlock();
			if (error)
				return (error);
			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
				return (error);
			error = vfs_write_suspend_umnt(mp);
			if (error != 0)
				return (error);
			fs->fs_ronly = 0;
			MNT_ILOCK(mp);
			saved_mnt_flag = MNT_RDONLY;
			if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag &
			    MNT_ASYNC) != 0)
				saved_mnt_flag |= MNT_ASYNC;
			mp->mnt_flag &= ~saved_mnt_flag;
			MNT_IUNLOCK(mp);
			fs->fs_mtime = time_second;
			/* check to see if we need to start softdep */
			if ((fs->fs_flags & FS_DOSOFTDEP) &&
			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
				fs->fs_ronly = 1;
				MNT_ILOCK(mp);
				mp->mnt_flag |= saved_mnt_flag;
				MNT_IUNLOCK(mp);
				vfs_write_resume(mp, 0);
				return (error);
			}
			fs->fs_clean = 0;
			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
				fs->fs_ronly = 1;
				MNT_ILOCK(mp);
				mp->mnt_flag |= saved_mnt_flag;
				MNT_IUNLOCK(mp);
				vfs_write_resume(mp, 0);
				return (error);
			}
			if (fs->fs_snapinum[0] != 0)
				ffs_snapshot_mount(mp);
			vfs_write_resume(mp, 0);
		}
		/*
		 * Soft updates is incompatible with "async",
		 * so if we are doing softupdates stop the user
		 * from setting the async flag in an update.
		 * Softdep_mount() clears it in an initial mount
		 * or ro->rw remount.
		 */
		if (MOUNTEDSOFTDEP(mp)) {
			/* XXX: Reset too late ? */
			MNT_ILOCK(mp);
			mp->mnt_flag &= ~MNT_ASYNC;
			MNT_IUNLOCK(mp);
		}
		/*
		 * Keep MNT_ACLS flag if it is stored in superblock.
		 */
		if ((fs->fs_flags & FS_ACLS) != 0) {
			/* XXX: Set too late ? */
			MNT_ILOCK(mp);
			mp->mnt_flag |= MNT_ACLS;
			MNT_IUNLOCK(mp);
		}

		if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
			/* XXX: Set too late ? */
			MNT_ILOCK(mp);
			mp->mnt_flag |= MNT_NFS4ACLS;
			MNT_IUNLOCK(mp);
		}
		/*
		 * If this is a request from fsck to clean up the filesystem,
		 * then allow the specified pid to proceed.
		 */
		if (fsckpid > 0) {
			if (ump->um_fsckpid != 0) {
				vfs_mount_error(mp,
				    "Active checker already running on %s",
				    fs->fs_fsmnt);
				return (EINVAL);
			}
			KASSERT(MOUNTEDSOFTDEP(mp) == 0,
			    ("soft updates enabled on read-only file system"));
			g_topology_lock();
			/*
			 * Request write access.
			 */
			error = g_access(ump->um_cp, 0, 1, 0);
			g_topology_unlock();
			if (error) {
				vfs_mount_error(mp,
				    "Checker activation failed on %s",
				    fs->fs_fsmnt);
				return (error);
			}
			ump->um_fsckpid = fsckpid;
			if (fs->fs_snapinum[0] != 0)
				ffs_snapshot_mount(mp);
			fs->fs_mtime = time_second;
			fs->fs_fmod = 1;
			fs->fs_clean = 0;
			(void) ffs_sbupdate(ump, MNT_WAIT, 0);
		}

		/*
		 * If this is a snapshot request, take the snapshot.
		 */
		if (mp->mnt_flag & MNT_SNAPSHOT)
			return (ffs_snapshot(mp, fspec));

		/*
		 * Must not call namei() while owning busy ref.
		 */
		vfs_unbusy(mp);
	}

	/*
	 * Not an update, or updating the name: look up the name
	 * and verify that it refers to a sensible disk device.
	 */
	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
	error = namei(&ndp);
	if ((mp->mnt_flag & MNT_UPDATE) != 0) {
		/*
		 * Unmount does not start if MNT_UPDATE is set.  Mount
		 * update busies mp before setting MNT_UPDATE.  We
		 * must be able to retain our busy ref successfully,
		 * without sleep.
		 */
		error1 = vfs_busy(mp, MBF_NOWAIT);
		MPASS(error1 == 0);
	}
	if (error != 0)
		return (error);
	NDFREE(&ndp, NDF_ONLY_PNBUF);
	devvp = ndp.ni_vp;
-	if (!vn_isdisk(devvp, &error)) {
+	if (!vn_isdisk_error(devvp, &error)) {
		vput(devvp);
		return (error);
	}

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
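	 *
	 * Illustrative shape, not part of this change: the discretionary
	 * check composes the needed rights from the mount mode and may be
	 * overridden by privilege:
	 *
	 *	accmode = VREAD | (read-write mount ? VWRITE : 0);
	 *	if (VOP_ACCESS(devvp, accmode, cred, td) fails)
	 *		error = priv_check(td, PRIV_VFS_MOUNT_PERM);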
*/ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount_alloc() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } if (fsckpid > 0) { KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { printf("WARNING: %s: Checker activation " "failed\n", fs->fs_fsmnt); } else { ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } } } MNT_ILOCK(mp); /* * This is racy versus lookup, see ufs_fplookup_vexec for details. */ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0) panic("MNTK_FPLOOKUP set on mount %p when it should not be", mp); if ((mp->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS | MNT_UNION)) == 0) mp->mnt_kern_flag |= MNTK_FPLOOKUP; MNT_IUNLOCK(mp); vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary * writers, if requested. * 6) invalidate all cached file data. * 7) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, struct thread *td, int flags) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, error; u_long size; int32_t *lp; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp); /* * Step 2: re-read superblock from disk. 
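 *
 * Validation sketch, not part of this change: the freshly read block is
 * accepted as a superblock only if
 *
 *	fs_magic is FS_UFS1_MAGIC or FS_UFS2_MAGIC, and
 *	sizeof(struct fs) <= fs_bsize <= MAXBSIZE
 *
 * anything else is rejected with EIO before it can be copied over the
 * in-core superblock.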
*/ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Preserve the summary information, read-only status, and * superblock location by copying these fields into our new * superblock before using it to update the existing superblock. */ newfs->fs_si = fs->fs_si; newfs->fs_ronly = fs->fs_ronly; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); free(fs->fs_csp, M_UFSMNT); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); if ((flags & FFSR_UNSUSPEND) != 0) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } if ((error = ffs_load_inode(bp, ip, fs, ip->i_number)) != 0) { brelse(bp); vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ip->i_effnlink = ip->i_nlink; brelse(bp); vput(vp); } return (0); } /* * Common code for mount and mountroot */ static int ffs_mountfs(odevvp, mp, td) struct vnode *odevvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct fs *fs; struct cdev *dev; int error, i, len, ronly; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; struct vnode *devvp; struct fsfail_task *etp; int candelete, canspeedup; off_t loc; fs = NULL; ump = NULL; cred = td ? 
td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; devvp = mntfs_allocvp(mp, odevvp); VOP_UNLOCK(odevvp); KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { mntfs_freevp(devvp); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); mntfs_freevp(devvp); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag |= BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } /* fetch the superblock and summary information */ loc = STDSB; if ((mp->mnt_flag & MNT_ROOTFS) != 0) loc = STDSB_NOHASHFAIL; if ((error = ffs_sbget(devvp, &fs, loc, M_UFSMNT, ffs_use_bread)) != 0) goto out; /* none of these types of check-hashes are maintained by this kernel */ fs->fs_metackhash &= ~(CK_INDIR | CK_DIR); /* no support for any undefined flags */ fs->fs_flags &= FS_SUPPORTED; fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck.", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. 
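 *
 * Illustrative aside, not part of this change: the lookup below uses a
 * guess-then-trim buffer protocol, since the provider reports the real
 * length back through the len parameter:
 *
 *	len = guess; buf = malloc(len);
 *	if (g_io_getattr(name, cp, &len, buf) == 0)
 *		buf = realloc(buf, len);	(shrink to fit)
 *	else
 *		free(buf);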
*/ len = 1024; mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &len, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = fs; if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; if ((mp->mnt_flag & MNT_UNTRUSTED) != 0) ump->um_check_blkno = ffs_check_blkno; else ump->um_check_blkno = NULL; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); ffs_oldfscompat_read(fs, ump, fs->fs_sblockloc); fs->fs_ronly = ronly; fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { len = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &len, &candelete) == 0) { if (candelete) ump->um_flags |= UM_CANDELETE; else printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); } if (((ump->um_flags) & UM_CANDELETE) != 0) { ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM, &ump->um_trimlisthashsize); } } len = sizeof(int); if 
(g_io_getattr("GEOM::canspeedup", cp, &len, &canspeedup) == 0) { if (canspeedup) ump->um_flags |= UM_CANSPEEDUP; } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_odevvp = odevvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem state information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ etp = malloc(sizeof *ump->um_fsfail_task, M_UFSMNT, M_WAITOK | M_ZERO); etp->fsid = mp->mnt_stat.f_fsid; ump->um_fsfail_task = etp; return (0); out: if (fs != NULL) { free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); } if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump, M_UFSMNT); mp->mnt_data = NULL; } BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); mntfs_freevp(devvp); dev_rel(dev); return (error); } /* * A read function for use by filesystem-layer routines. */ static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size) { struct buf *bp; int error; KASSERT(*bufp == NULL, ("ffs_use_bread: non-NULL *bufp %p\n", *bufp)); *bufp = malloc(size, M_UFSMNT, M_WAITOK); if ((error = bread((struct vnode *)devfd, btodb(loc), size, NOCRED, &bp)) != 0) return (error); bcopy(bp->b_data, *bufp, size); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (0); } static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. 
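 *
 * Worked example, not part of this change, for the UFS1 clamp applied
 * later in this function: with 16384-byte blocks,
 *
 *	maxfilesize = 0x80000000 * 16384 - 1 = 2^45 - 1
 *
 * i.e. just under 32TB, while pre-4.4BSD inode formats are further
 * limited to 2^31 - 1 bytes.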
*/ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; susp = fs->fs_ronly == 0; #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && !ffs_fsfail_cleanup(ump, error)) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 
0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (ffs_fsfail_cleanup(ump, error)) error = 0; if (error != 0 && !ffs_fsfail_cleanup(ump, error)) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); if (ump->um_trim_tq != NULL) { while (ump->um_trim_inflight != 0) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); free (ump->um_trimhash, M_TRIM); } g_topology_lock(); if (ump->um_fsckpid > 0) { /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); ump->um_fsckpid = 0; } g_vfs_close(ump->um_cp); g_topology_unlock(); BO_LOCK(&ump->um_odevvp->v_bufobj); ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&ump->um_odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); mntfs_freevp(ump->um_devvp); vrele(ump->um_odevvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); if (ump->um_fsfail_task != NULL) free(ump->um_fsfail_task, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); if (td->td_su == mp) { td->td_su = NULL; vfs_rel(mp); } return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp); return (error); } /* * Get filesystem statistics. 
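 */

/*
 * Illustrative helper, not part of this change: the f_bfree arithmetic
 * used by ffs_statfs() below, isolated.  The parameter names are
 * assumptions of the example; "pendingfrags" stands for
 * dbtofsb(fs, fs->fs_pendingblocks).
 */
static int64_t
bfree_in_frags(int64_t nbfree, int64_t nffree, int64_t pendingfrags,
    int frag)
{
	/* Whole free blocks contribute fs_frag fragments apiece. */
	return (nbfree * frag + nffree + pendingfrags);
}

/*
 * The statistics handler follows.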
*/ static int ffs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = UFS_MAXNAMLEN; return (0); } static bool sync_doupdate(struct inode *ip) { return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0); } static int ffs_sync_lazy_filter(struct vnode *vp, void *arg __unused) { struct inode *ip; /* * Flags are safe to access because ->v_data invalidation * is held off by listmtx. */ if (vp->v_type == VNON) return (false); ip = VTOI(vp); if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) return (false); return (true); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(mp) struct mount *mp; { struct vnode *mvp, *vp; struct inode *ip; struct thread *td; int allerror, error; allerror = 0; td = curthread; if ((mp->mnt_flag & MNT_NOATIME) != 0) { #ifdef QUOTA qsync(mp); #endif goto sbupdate; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, ffs_sync_lazy_filter, NULL) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK)) != 0) continue; #ifdef QUOTA qsyncvp(vp); #endif if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } sbupdate: if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. 
*/ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) lockreq = LK_EXCLUSIVE; lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq)) != 0) { if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } #ifdef QUOTA qsyncvp(vp); #endif if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) allerror = error; vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(mp, ino, flags, vpp, ffs_flags) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int ffs_flags; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; daddr_t dbn; int error; MPASS((ffs_flags & FFSV_REPLACE) == 0 || (flags & LK_EXCLUSIVE) != 0); error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { if ((ffs_flags & FFSV_REPLACE) == 0) return (0); vgone(*vpp); vput(*vpp); } /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. 
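 *
 * Illustrative aside, not part of this change: the promotion below just
 * rewrites the lock-type bits, leaving other LK_* modifiers intact,
 * i.e. it is equivalent to
 *
 *	flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;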
*/ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); fs = ump->um_fs; ip = uma_zalloc_smr(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? &ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree_smr(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; ip->i_mode = 0; /* ensure error cases below throw away vnode */ #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree_smr(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { /* * Calls from ffs_valloc() (i.e. FFSV_REPLACE set) * operate on empty inode, which must not be found by * other threads until fully filled. Vnode for empty * inode must be not re-inserted on the hash by other * thread, after removal by us at the beginning. */ MPASS((ffs_flags & FFSV_REPLACE) == 0); return (0); } /* Read in the disk contents for the inode, copy into the inode. */ dbn = fsbtodb(fs, ino_to_fsba(fs, ino)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ vgone(vp); vput(vp); *vpp = NULL; return (error); } if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); if ((error = ffs_load_inode(bp, ip, fs, ino)) != 0) { bqrelse(bp); vgone(vp); vput(vp); *vpp = NULL; return (error); } if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, &vp); if (error) { vgone(vp); vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { while (ip->i_gen == 0) ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { UFS_INODE_SET_FLAG(ip, IN_MODIFIED); DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. 
*/ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vgone(vp); vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ufid *ufhp; struct ufsmount *ump; struct fs *fs; struct cg *cgp; struct buf *bp; ino_t ino; u_int cg; int error; ufhp = (struct ufid *)fhp; ino = ufhp->ufid_ino; ump = VFSTOUFS(mp); fs = ump->um_fs; if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); /* * Need to check if inode is initialized because UFS2 does lazy * initialization and nfs_fhtovp can offer arbitrary inode numbers. */ if (fs->fs_magic != FS_UFS2_MAGIC) return (ufs_fhtovp(mp, ufhp, flags, vpp)); cg = ino_to_cg(fs, ino); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) return (error); if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { brelse(bp); return (ESTALE); } brelse(bp); return (ufs_fhtovp(mp, ufhp, flags, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); taskqueue_drain_all(taskqueue_thread); return (ret); } /* * Structure used to pass information from ffs_sbupdate to its * helper routine ffs_use_bwrite. */ struct devfd { struct ufsmount *ump; struct buf *sbbp; int waitfor; int suspended; int error; }; /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(ump, waitfor, suspended) struct ufsmount *ump; int waitfor; int suspended; { struct fs *fs; struct buf *sbbp; struct devfd devfd; fs = ump->um_fs; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * Initialize info needed for write function. */ devfd.ump = ump; devfd.sbbp = sbbp; devfd.waitfor = waitfor; devfd.suspended = suspended; devfd.error = 0; return (ffs_sbput(&devfd, fs, fs->fs_sblockloc, ffs_use_bwrite)); } /* * Write function for use by filesystem-layer routines. */ static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size) { struct devfd *devfdp; struct ufsmount *ump; struct buf *bp; struct fs *fs; int error; devfdp = devfd; ump = devfdp->ump; fs = ump->um_fs; /* * Writing the superblock summary information. */ if (loc != fs->fs_sblockloc) { bp = getblk(ump->um_devvp, btodb(loc), size, 0, 0, 0); bcopy(buf, bp->b_data, (u_int)size); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (0); } /* * Writing the superblock itself. We need to do special checks for it. 
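 *
 * Illustrative aside, not part of this change: ffs_sbput() drives both
 * cases through the same callback contract,
 *
 *	int (*writefunc)(void *devfd, off_t loc, void *buf, int size)
 *
 * with every piece of state the helper needs carried in the struct
 * devfd behind the opaque pointer.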
*/ bp = devfdp->sbbp; if (ffs_fsfail_cleanup(ump, devfdp->error)) devfdp->error = 0; if (devfdp->error != 0) { brelse(bp); return (devfdp->error); } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); fs = (struct fs *)bp->b_data; ffs_oldfscompat_write(fs, ump); fs->fs_si = NULL; /* Recalculate the superblock hash */ fs->fs_ckhash = ffs_calc_sbhash(fs); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (devfdp->error); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree_smr(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) != 0) softdep_handle_error(bp); #endif /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * We should mark the cylinder group buffer origbp as * dirty, to not lose the failed write. */ if ((bp->b_ioflags & BIO_ERROR) != 0) origbp->b_vflags |= BV_BKGRDERR; BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; pbrelvp(bp); /* * Prevent brelse() from trying to keep and re-dirtying bp on * errors. It causes b_bufobj dereference in * bdirty()/reassignbuf(), and b_bufobj was cleared in * pbrelvp() above. */ if ((bp->b_ioflags & BIO_ERROR) != 0) bp->b_flags |= B_INVAL; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. 
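 *
 * Handshake sketch, not part of this change:
 *
 *	writer:   orig->b_vflags |= BV_BKGRDINPROG; write the copy
 *	waiter:   set BV_BKGRDWAIT; sleep until BV_BKGRDINPROG clears
 *	here:     clear BV_BKGRDINPROG; wake anyone with BV_BKGRDWAIT set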
*/ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; struct cg *cgp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); newbp->b_xflags |= (bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. We have to recalculate * its check hash in case the buffer gets freed and then * reconstituted from the buffer cache during a later read. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_ckhash = 0; cgp->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); } bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: /* * If we are writing a cylinder group, update its time. 
*/ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_old_time = cgp->cg_time = time_second; } return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; struct buf *tbp; int error, nocopy; /* * This is the bufobj strategy for the private VCHR vnodes * used by FFS to access the underlying storage device. * We override the default bufobj strategy and thus bypass * VOP_STRATEGY() for these vnodes. */ vp = bo2vnode(bo); KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR || bp->b_vp->v_rdev == NULL || bp->b_vp->v_rdev->si_mountpt == NULL || VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL || vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp, ("ffs_geom_strategy() with wrong vp")); if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif /* * Check for metadata that needs check-hashes and update them. */ switch (bp->b_xflags & BX_FSPRIV) { case BX_CYLGRP: ((struct cg *)bp->b_data)->cg_ckhash = 0; ((struct cg *)bp->b_data)->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); break; case BX_SUPERBLOCK: case BX_INODE: case BX_INDIR: case BX_DIR: printf("Check-hash write is unimplemented!!!\n"); break; case 0: break; default: printf("multiple buffer types 0x%b\n", (u_int)(bp->b_xflags & BX_FSPRIV), PRINT_UFS_BUF_XFLAGS); break; } } if (bp->b_iocmd != BIO_READ && ffs_enxio_enable) bp->b_xflags |= BX_CVTENXIO; g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 364371) +++ head/sys/vm/swap_pager.c (revision 364372) @@ -1,3103 +1,3103 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. 
* Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64. * The 64-page limit is due to the radix code (kern/subr_blist.c). 
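 */

/*
 * Illustrative compile-time guard, not part of this change: the stated
 * constraint can be spelled as an assertion; 32 here is the default
 * value chosen just below.
 */
#define	POW2_IN_RANGE(x)	((x) >= 1 && (x) <= 64 && \
				 ((x) & ((x) - 1)) == 0)
_Static_assert(POW2_IN_RANGE(32),
    "MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64");

/*
 * The tunable itself follows.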
*/ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 32 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif #define SWAP_META_PAGES PCTRIE_COUNT /* * A swblk structure maps each page index within a * SWAP_META_PAGES-aligned and sized range to the address of an * on-disk swap block (or SWAPBLK_NONE). The collection of these * mappings for an entire vm object is implemented as a pc-trie. */ struct swblk { vm_pindex_t p; daddr_t d[SWAP_META_PAGES]; }; static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data"); static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static struct sx swdev_syscall_lock; /* serialize swap(on|off) */ static __exclusive_cache_line u_long swap_reserved; static u_long swap_total; static int sysctl_page_shift(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "VM swap stats"); SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_reserved, 0, sysctl_page_shift, "A", "Amount of swap storage needed to back all allocated anonymous memory."); SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, &swap_total, 0, sysctl_page_shift, "A", "Total amount of available swap storage."); static int overcommit = 0; SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0, "Configure virtual memory overcommit behavior. See tuning(7) " "for details."); static unsigned long swzone; SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0, "Actual size of swap metadata zone"); static unsigned long swap_maxpages; SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0, "Maximum amount of swap supported"); static COUNTER_U64_DEFINE_EARLY(swap_free_deferred); SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred, CTLFLAG_RD, &swap_free_deferred, "Number of pages that deferred freeing swap space"); static COUNTER_U64_DEFINE_EARLY(swap_free_completed); SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed, CTLFLAG_RD, &swap_free_completed, "Number of deferred frees completed"); /* bits from overcommit */ #define SWAP_RESERVE_FORCE_ON (1 << 0) #define SWAP_RESERVE_RLIMIT_ON (1 << 1) #define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2) static int sysctl_page_shift(SYSCTL_HANDLER_ARGS) { uint64_t newval; u_long value = *(u_long *)arg1; newval = ((uint64_t)value) << PAGE_SHIFT; return (sysctl_handle_64(oidp, &newval, 0, req)); } static bool swap_reserve_by_cred_rlimit(u_long pincr, struct ucred *cred, int oc) { struct uidinfo *uip; u_long prev; uip = cred->cr_ruidinfo; prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr); if ((oc & SWAP_RESERVE_RLIMIT_ON) != 0 && prev + pincr > lim_cur(curthread, RLIMIT_SWAP) && priv_check(curthread, PRIV_VM_SWAP_NORLIMIT) != 0) { prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr); KASSERT(prev >= pincr, ("negative vmsize for uid = %d\n", uip->ui_uid)); return (false); } return (true); } static void swap_release_by_cred_rlimit(u_long pdecr, struct ucred *cred) { struct uidinfo *uip; #ifdef INVARIANTS u_long prev; #endif uip = cred->cr_ruidinfo; #ifdef INVARIANTS prev = atomic_fetchadd_long(&uip->ui_vmsize, -pdecr); KASSERT(prev >= pdecr, ("negative vmsize for uid = %d\n", uip->ui_uid)); #else atomic_subtract_long(&uip->ui_vmsize, pdecr); #endif } static void swap_reserve_force_rlimit(u_long 
pincr, struct ucred *cred) { struct uidinfo *uip; uip = cred->cr_ruidinfo; atomic_add_long(&uip->ui_vmsize, pincr); } bool swap_reserve(vm_ooffset_t incr) { return (swap_reserve_by_cred(incr, curthread->td_ucred)); } bool swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred) { u_long r, s, prev, pincr; #ifdef RACCT int error; #endif int oc; static int curfail; static struct timeval lastfail; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); #ifdef RACCT if (RACCT_ENABLED()) { PROC_LOCK(curproc); error = racct_add(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); if (error != 0) return (false); } #endif pincr = atop(incr); prev = atomic_fetchadd_long(&swap_reserved, pincr); r = prev + pincr; s = swap_total; oc = atomic_load_int(&overcommit); if (r > s && (oc & SWAP_RESERVE_ALLOW_NONWIRED) != 0) { s += vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_wire_count(); } if ((oc & SWAP_RESERVE_FORCE_ON) != 0 && r > s && priv_check(curthread, PRIV_VM_SWAP_NOQUOTA) != 0) { prev = atomic_fetchadd_long(&swap_reserved, -pincr); KASSERT(prev >= pincr, ("swap_reserved < incr on overcommit fail")); goto out_error; } if (!swap_reserve_by_cred_rlimit(pincr, cred, oc)) { prev = atomic_fetchadd_long(&swap_reserved, -pincr); KASSERT(prev >= pincr, ("swap_reserved < incr on overcommit fail")); goto out_error; } return (true); out_error: if (ppsratecheck(&lastfail, &curfail, 1)) { printf("uid %d, pid %d: swap reservation for %jd bytes failed\n", cred->cr_ruidinfo->ui_uid, curproc->p_pid, incr); } #ifdef RACCT if (RACCT_ENABLED()) { PROC_LOCK(curproc); racct_sub(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif return (false); } void swap_reserve_force(vm_ooffset_t incr) { u_long pincr; KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__, (uintmax_t)incr)); #ifdef RACCT if (RACCT_ENABLED()) { PROC_LOCK(curproc); racct_add_force(curproc, RACCT_SWAP, incr); PROC_UNLOCK(curproc); } #endif pincr = atop(incr); atomic_add_long(&swap_reserved, pincr); swap_reserve_force_rlimit(pincr, curthread->td_ucred); } void swap_release(vm_ooffset_t decr) { struct ucred *cred; PROC_LOCK(curproc); cred = curproc->p_ucred; swap_release_by_cred(decr, cred); PROC_UNLOCK(curproc); } void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred) { u_long pdecr; #ifdef INVARIANTS u_long prev; #endif KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__, (uintmax_t)decr)); pdecr = atop(decr); #ifdef INVARIANTS prev = atomic_fetchadd_long(&swap_reserved, -pdecr); KASSERT(prev >= pdecr, ("swap_reserved < decr")); #else atomic_subtract_long(&swap_reserved, pdecr); #endif swap_release_by_cred_rlimit(pdecr, cred); #ifdef RACCT if (racct_enable) racct_sub_cred(cred, RACCT_SWAP, decr); #endif } static int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static struct mtx swbuf_mtx; /* to sync nsw_wcount_async */ static int nsw_wcount_async; /* limit async write buffers */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I", "Maximum running async swap ops"); static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 
sysctl_swap_fragmentation, "A", "Swap Fragmentation Info"); static struct sx sw_alloc_sx; /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swwbuf_zone; static uma_zone_t swrbuf_zone; static uma_zone_t swblk_zone; static uma_zone_t swpctrie_zone; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, pgo_getpages_iodone_t, void *); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); static void swap_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); static void swap_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_getpages_async = swap_pager_getpages_async, /* pagein (async) */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ .pgo_update_writecount = swap_pager_update_writecount, .pgo_release_writecount = swap_pager_release_writecount, }; /* * swap_*() routines are externally accessible. swp_*() routines are * internal. 
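 */

/*
 * Hypothetical illustration (editor-added sketch, not part of the original
 * file): how a pager handle hashes into one of the NOBJLISTS buckets
 * declared above.  The low four bits are shifted away because handles are
 * pointer-aligned and carry no entropy there.
 */
static inline int
swp_objlist_index_example(void *handle)
{

	return (((int)(intptr_t)handle >> 4) & (NOBJLISTS - 1));
}

/*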
*/ static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0, "Maximum size of a swap block in pages"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit); static void swp_pager_free_empty_swblk(vm_object_t, struct swblk *sb); static int swapongeom(struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct ucred *cred); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages); static daddr_t swp_pager_getswapspace(int *npages); /* * Metadata functions */ static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t); static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst, vm_pindex_t pindex, vm_pindex_t count); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_lookup(vm_object_t, vm_pindex_t); static void swp_pager_init_freerange(daddr_t *start, daddr_t *num) { *start = SWAPBLK_NONE; *num = 0; } static void swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr) { if (*start + *num == addr) { (*num)++; } else { swp_pager_freeswapspace(*start, *num); *start = addr; *num = 1; } } static void * swblk_trie_alloc(struct pctrie *ptree) { return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0))); } static void swblk_trie_free(struct pctrie *ptree, void *node) { uma_zfree(swpctrie_zone, node); } PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); sx_init(&sw_alloc_sx, "swspsx"); sx_init(&swdev_syscall_lock, "swsysc"); } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { unsigned long n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array, which has MAXPHYS / PAGE_SIZE entries, and our locally * defined MAX_PAGEOUT_CLUSTER. 
Also be aware that swap ops are * constrained by the swap device interleave stripe size. * * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER); nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF); swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4); swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2); /* * Initialize our zone, taking the user's requested size or * estimating the number we need based on the number of pages * in the system. */ n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) : vm_cnt.v_page_count / 2; swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 0); if (swpctrie_zone == NULL) panic("failed to create swap pctrie zone."); swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL, NULL, NULL, _Alignof(struct swblk) - 1, 0); if (swblk_zone == NULL) panic("failed to create swap blk zone."); n2 = n; do { if (uma_zone_reserve_kva(swblk_zone, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); /* * Often uma_zone_reserve_kva() cannot reserve exactly the * requested size. Account for the difference when * calculating swap_maxpages. */ n = uma_zone_get_max(swblk_zone); if (n < n2) printf("Swap blk zone entries changed from %lu to %lu.\n", n2, n); /* absolute maximum we can handle assuming 100% efficiency */ swap_maxpages = n * SWAP_META_PAGES; swzone = n * sizeof(struct swblk); if (!uma_zone_reserve_kva(swpctrie_zone, n)) printf("Cannot reserve swap pctrie zone, " "reduce kern.maxswzone.\n"); } static vm_object_t swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size, vm_ooffset_t offset) { vm_object_t object; if (cred != NULL) { if (!swap_reserve_by_cred(size, cred)) return (NULL); crhold(cred); } /* * The un_pager.swp.swp_blks trie is initialized by * vm_object_allocate() to ensure the correct order of * visibility to other threads. */ object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset + PAGE_MASK + size)); object->un_pager.swp.writemappings = 0; object->handle = handle; if (cred != NULL) { object->cred = cred; object->charge = size; } return (object); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. * * This routine must ensure that no live duplicate is created for * the named object request, which is protected against by * holding the sw_alloc_sx lock in case handle != NULL. */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; if (handle != NULL) { /* * Reference existing named region or allocate new one. 
There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. */ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object == NULL) { object = swap_pager_alloc_init(handle, cred, size, offset); if (object != NULL) { TAILQ_INSERT_TAIL(NOBJLIST(object->handle), object, pager_object_list); } } sx_xunlock(&sw_alloc_sx); } else { object = swap_pager_alloc_init(handle, cred, size, offset); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * The object must be locked. */ static void swap_pager_dealloc(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj")); /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if ((object->flags & OBJ_ANON) == 0 && object->handle != NULL) { VM_OBJECT_WUNLOCK(object); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(object); } vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); object->handle = NULL; object->type = OBJT_DEAD; } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for up to the requested number of pages. The * starting swap block number (a page index) is returned or * SWAPBLK_NONE if the allocation failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * This routine may not sleep. * * We allocate in round-robin fashion from the configured devices. 
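 */

/*
 * Simplified, hypothetical model (not from the original source) of the
 * request-size backoff performed by swp_pager_getswapspace() below: once
 * every device has refused an allocation of the current size, the request
 * is halved until it reaches a single page.  This sketch just counts how
 * many times a request for "mpages" pages can shrink before the scan
 * gives up.
 */
static int
swp_backoff_passes_example(int mpages)
{
	int npages, passes;

	passes = 0;
	for (npages = imin(BLIST_MAX_ALLOC, mpages); npages > 1;
	    npages >>= 1)
		passes++;
	return (passes);
}

/*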
*/ static daddr_t swp_pager_getswapspace(int *io_npages) { daddr_t blk; struct swdevt *sp; int mpages, npages; KASSERT(*io_npages >= 1, ("%s: npages not positive", __func__)); blk = SWAPBLK_NONE; mpages = *io_npages; npages = imin(BLIST_MAX_ALLOC, mpages); mtx_lock(&sw_dev_mtx); sp = swdevhd; while (!TAILQ_EMPTY(&swtailq)) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if ((sp->sw_flags & SW_CLOSING) == 0) blk = blist_alloc(sp->sw_blist, &npages, mpages); if (blk != SWAPBLK_NONE) break; sp = TAILQ_NEXT(sp, sw_list); if (swdevhd == sp) { if (npages == 1) break; mpages = npages - 1; npages >>= 1; } } if (blk != SWAPBLK_NONE) { *io_npages = npages; blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); } else { if (swap_pager_full != 2) { printf("swp_pager_getswapspace(%d): failed\n", *io_npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; } mtx_unlock(&sw_dev_mtx); return (blk); } static bool swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(bp->b_blkno, sp)) { mtx_unlock(&sw_dev_mtx); if ((sp->sw_flags & SW_UNMAPPED) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { pmap_qenter((vm_offset_t)bp->b_data, &bp->b_pages[0], bp->b_bcount / PAGE_SIZE); } sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * This routine may not sleep. */ static void swp_pager_freeswapspace(daddr_t blk, daddr_t npages) { struct swdevt *sp; if (npages == 0) return; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (swp_pager_isondev(blk, sp)) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SYSCTL_SWAP_FRAGMENTATION() - produce raw swap space stats */ static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct swdevt *sp; const char *devname; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { - if (vn_isdisk(sp->sw_vp, NULL)) + if (vn_isdisk(sp->sw_vp)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname); blist_stats(sp->sw_blist, &sbuf); } mtx_unlock(&sw_dev_mtx); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * The object must be locked. 
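 */

/*
 * Hypothetical usage sketch (editor-added, not from the original source)
 * of the free-range batching helpers defined earlier in this file:
 * contiguous blocks merge into a single run, so the bitmap is touched
 * once per run instead of once per block.  swap_pager_reserve() below
 * uses the same init/update/flush pattern for real.
 */
static void
swp_freerange_usage_example(const daddr_t *blocks, int n)
{
	daddr_t n_free, s_free;
	int i;

	swp_pager_init_freerange(&s_free, &n_free);
	for (i = 0; i < n; i++)
		swp_pager_update_freerange(&s_free, &n_free, blocks[i]);
	/* Return the final accumulated run, if any, to the bitmap. */
	swp_pager_freeswapspace(s_free, n_free);
}

/*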
*/ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zeroed. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. */ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { daddr_t addr, blk, n_free, s_free; int i, j, n; swp_pager_init_freerange(&s_free, &n_free); VM_OBJECT_WLOCK(object); for (i = 0; i < size; i += n) { n = size - i; blk = swp_pager_getswapspace(&n); if (blk == SWAPBLK_NONE) { swp_pager_meta_free(object, start, i); VM_OBJECT_WUNLOCK(object); return (-1); } for (j = 0; j < n; ++j) { addr = swp_pager_meta_build(object, start + i + j, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); } } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WUNLOCK(object); return (0); } static bool swp_pager_xfer_source(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t pindex, daddr_t addr) { daddr_t dstaddr; KASSERT(srcobject->type == OBJT_SWAP, ("%s: Srcobject not swappable", __func__)); if (dstobject->type == OBJT_SWAP && swp_pager_meta_lookup(dstobject, pindex) != SWAPBLK_NONE) { /* Caller should destroy the source block. */ return (false); } /* * Destination has no swapblk and is not resident, transfer source. * swp_pager_meta_build() can sleep. */ VM_OBJECT_WUNLOCK(srcobject); dstaddr = swp_pager_meta_build(dstobject, pindex, addr); KASSERT(dstaddr == SWAPBLK_NONE, ("Unexpected destination swapblk")); VM_OBJECT_WLOCK(srcobject); return (true); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to sleep. It may sleep allocating metadata * indirectly through swp_pager_meta_build(). * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked. * Both object locks may temporarily be released. */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { VM_OBJECT_ASSERT_WLOCKED(srcobject); VM_OBJECT_ASSERT_WLOCKED(dstobject); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource && (srcobject->flags & OBJ_ANON) == 0 && srcobject->handle != NULL) { VM_OBJECT_WUNLOCK(srcobject); VM_OBJECT_WUNLOCK(dstobject); sx_xlock(&sw_alloc_sx); TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject, pager_object_list); sx_xunlock(&sw_alloc_sx); VM_OBJECT_WLOCK(dstobject); VM_OBJECT_WLOCK(srcobject); } /* * Transfer source to destination. */ swp_pager_meta_transfer(srcobject, dstobject, offset, dstobject->size); /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidentally * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. 
*/ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk, blk0; int i; VM_OBJECT_ASSERT_LOCKED(object); KASSERT(object->type == OBJT_SWAP, ("%s: object not swappable", __func__)); /* * do we have good backing store at the requested index? */ blk0 = swp_pager_meta_lookup(object, pindex); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { for (i = 1; i < SWB_NPAGES; i++) { if (i > pindex) break; blk = swp_pager_meta_lookup(object, pindex - i); if (blk != blk0 - i) break; } *before = i - 1; } /* * find forward-looking contiguous good backing store */ if (after != NULL) { for (i = 1; i < SWB_NPAGES; i++) { blk = swp_pager_meta_lookup(object, pindex + i); if (blk != blk0 + i) break; } *after = i - 1; } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation. * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not sleep. * * The object containing the page may be locked. */ static void swap_pager_unswapped(vm_page_t m) { struct swblk *sb; vm_object_t obj; /* * Handle enqueuing deferred frees first. If we do not have the * object lock we wait for the page daemon to clear the space. */ obj = m->object; if (!VM_OBJECT_WOWNED(obj)) { VM_PAGE_OBJECT_BUSY_ASSERT(m); /* * The caller is responsible for synchronization but we * will harmlessly handle races. This is typically provided * by only calling unswapped() when a page transitions from * clean to dirty. */ if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) == PGA_SWAP_SPACE) { vm_page_aflag_set(m, PGA_SWAP_FREE); counter_u64_add(swap_free_deferred, 1); } return; } if ((m->a.flags & PGA_SWAP_FREE) != 0) counter_u64_add(swap_free_completed, 1); vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE); /* * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ KASSERT(m->object->type == OBJT_SWAP, ("Free object not swappable")); sb = SWAP_PCTRIE_LOOKUP(&m->object->un_pager.swp.swp_blks, rounddown(m->pindex, SWAP_META_PAGES)); if (sb == NULL) return; if (sb->d[m->pindex % SWAP_META_PAGES] == SWAPBLK_NONE) return; swp_pager_freeswapspace(sb->d[m->pindex % SWAP_META_PAGES], 1); sb->d[m->pindex % SWAP_META_PAGES] = SWAPBLK_NONE; swp_pager_free_empty_swblk(m->object, sb); } /* * swap_pager_getpages() - bring pages in from swap * * Attempt to page in the pages in array "ma" of length "count". The * caller may optionally specify that additional pages preceding and * succeeding the specified range be paged in.
The number of such pages * is returned in the "rbehind" and "rahead" parameters, and they will * be in the inactive queue upon return. * * The pages in "ma" must be busied and will remain busied upon return. */ static int swap_pager_getpages_locked(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { struct buf *bp; vm_page_t bm, mpred, msucc, p; vm_pindex_t pindex; daddr_t blk; int i, maxahead, maxbehind, reqcount; VM_OBJECT_ASSERT_WLOCKED(object); reqcount = count; KASSERT(object->type == OBJT_SWAP, ("%s: object not swappable", __func__)); if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) { VM_OBJECT_WUNLOCK(object); return (VM_PAGER_FAIL); } KASSERT(reqcount - 1 <= maxahead, ("page count %d extends beyond swap block", reqcount)); /* * Do not transfer any pages other than those that are xbusied * when running during a split or collapse operation. This * prevents clustering from re-creating pages which are being * moved into another object. */ if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) { maxahead = reqcount - 1; maxbehind = 0; } /* * Clip the readahead and readbehind ranges to exclude resident pages. */ if (rahead != NULL) { *rahead = imin(*rahead, maxahead - (reqcount - 1)); pindex = ma[reqcount - 1]->pindex; msucc = TAILQ_NEXT(ma[reqcount - 1], listq); if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead) *rahead = msucc->pindex - pindex - 1; } if (rbehind != NULL) { *rbehind = imin(*rbehind, maxbehind); pindex = ma[0]->pindex; mpred = TAILQ_PREV(ma[0], pglist, listq); if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind) *rbehind = pindex - mpred->pindex - 1; } bm = ma[0]; for (i = 0; i < count; i++) ma[i]->oflags |= VPO_SWAPINPROG; /* * Allocate readahead and readbehind pages. */ if (rbehind != NULL) { for (i = 1; i <= *rbehind; i++) { p = vm_page_alloc(object, ma[0]->pindex - i, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; bm = p; } *rbehind = i - 1; } if (rahead != NULL) { for (i = 0; i < *rahead; i++) { p = vm_page_alloc(object, ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL); if (p == NULL) break; p->oflags |= VPO_SWAPINPROG; } *rahead = i; } if (rbehind != NULL) count += *rbehind; if (rahead != NULL) count += *rahead; vm_object_pip_add(object, count); pindex = bm->pindex; blk = swp_pager_meta_lookup(object, pindex); KASSERT(blk != SWAPBLK_NONE, ("no swap block containing %p(%jx)", object, (uintmax_t)pindex)); VM_OBJECT_WUNLOCK(object); bp = uma_zalloc(swrbuf_zone, M_WAITOK); /* Pages cannot leave the object while busy. */ for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) { MPASS(p->pindex == bm->pindex + i); bp->b_pages[i] = p; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; bp->b_npages = count; bp->b_pgbefore = rbehind != NULL ? *rbehind : 0; bp->b_pgafter = rahead != NULL ? *rahead : 0; VM_CNT_INC(v_swapin); VM_CNT_ADD(v_swappgsin, count); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our ma[] array are also released on completion, * so we cannot assume they are valid anymore either.
* * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * Wait for the pages we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the metadata for each page in the request. */ VM_OBJECT_WLOCK(object); /* This could be implemented more efficiently with aflags */ while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) { ma[0]->oflags |= VPO_SWAPSLEEP; VM_CNT_INC(v_intrans); if (VM_OBJECT_SLEEP(object, &object->handle, PSWP, "swread", hz * 20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } VM_OBJECT_WUNLOCK(object); /* * If we had an unrecoverable read error pages will not be valid. */ for (i = 0; i < reqcount; i++) if (ma[i]->valid != VM_PAGE_BITS_ALL) return (VM_PAGER_ERROR); return (VM_PAGER_OK); /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. */ } static int swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead) { VM_OBJECT_WLOCK(object); return (swap_pager_getpages_locked(object, ma, count, rbehind, rahead)); } /* * swap_pager_getpages_async(): * * Right now this is emulation of asynchronous operation on top of * swap_pager_getpages(). */ static int swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg) { int r, error; r = swap_pager_getpages(object, ma, count, rbehind, rahead); switch (r) { case VM_PAGER_OK: error = 0; break; case VM_PAGER_ERROR: error = EIO; break; case VM_PAGER_FAIL: error = EINVAL; break; default: panic("unhandled swap_pager_getpages() error %d", r); } (iodone)(arg, ma, count, error); return (r); } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whose rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ static void swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count, int flags, int *rtvals) { struct buf *bp; daddr_t addr, blk, n_free, s_free; vm_page_t mreq; int i, j, n; bool async; KASSERT(count == 0 || ma[0]->object == object, ("%s: object mismatch %p/%p", __func__, object, ma[0]->object)); /* * Step 1 * * Turn object into OBJT_SWAP. Force sync if not a pageout process. */ if (object->type != OBJT_SWAP) { addr = swp_pager_meta_build(object, 0, SWAPBLK_NONE); KASSERT(addr == SWAPBLK_NONE, ("unexpected object swap block")); } VM_OBJECT_WUNLOCK(object); async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0; swp_pager_init_freerange(&s_free, &n_free); /* * Step 2 * * Assign swap blocks and issue I/O. 
We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { /* Maximum I/O size is limited by maximum swap block size. */ n = min(count - i, nsw_cluster_max); if (async) { mtx_lock(&swbuf_mtx); while (nsw_wcount_async == 0) msleep(&nsw_wcount_async, &swbuf_mtx, PVM, "swbufa", 0); nsw_wcount_async--; mtx_unlock(&swbuf_mtx); } /* Get a block of swap of size up to n. */ VM_OBJECT_WLOCK(object); blk = swp_pager_getswapspace(&n); if (blk == SWAPBLK_NONE) { VM_OBJECT_WUNLOCK(object); mtx_lock(&swbuf_mtx); if (++nsw_wcount_async == 1) wakeup(&nsw_wcount_async); mtx_unlock(&swbuf_mtx); for (j = 0; j < n; ++j) rtvals[i + j] = VM_PAGER_FAIL; continue; } for (j = 0; j < n; ++j) { mreq = ma[i + j]; vm_page_aflag_clear(mreq, PGA_SWAP_FREE); addr = swp_pager_meta_build(mreq->object, mreq->pindex, blk + j); if (addr != SWAPBLK_NONE) swp_pager_update_freerange(&s_free, &n_free, addr); MPASS(mreq->dirty == VM_PAGE_BITS_ALL); mreq->oflags |= VPO_SWAPINPROG; } VM_OBJECT_WUNLOCK(object); bp = uma_zalloc(swwbuf_zone, M_WAITOK); if (async) bp->b_flags = B_ASYNC; bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; for (j = 0; j < n; j++) bp->b_pages[j] = ma[i + j]; bp->b_npages = n; /* * Must set dirty range for NFS to work. */ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; VM_CNT_INC(v_swapout); VM_CNT_ADD(v_swappgsout, bp->b_npages); /* * We unconditionally set rtvals[] to VM_PAGER_PEND so that we * can call the async completion routine at the end of a * synchronous I/O operation. Otherwise, our caller would * perform duplicate unbusy and wakeup operations on the page * and object, respectively. */ for (j = 0; j < n; j++) rtvals[i + j] = VM_PAGER_PEND; /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ if (async) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy. */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete. */ bwait(bp, PVM, "swwrt"); /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } swp_pager_freeswapspace(s_free, n_free); VM_OBJECT_WLOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * This routine may not sleep. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * Report error - unless we ran out of memory, in which case * we've already logged it in swapgeom_strategy(). */ if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) { printf( "swap_pager: I/O error - %s failed; blkno %ld, " "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ if (buf_mapped(bp)) pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); else bp->b_data = bp->b_kvabase; if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_WLOCK(object); } /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble.
If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The erroneous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (m->oflags & VPO_SWAPSLEEP) { m->oflags &= ~VPO_SWAPSLEEP; wakeup(&object->handle); } /* We always have space after I/O, successful or not. */ vm_page_aflag_set(m, PGA_SWAP_SPACE); if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. */ vm_page_invalid(m); } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ MPASS(m->dirty == VM_PAGE_BITS_ALL); /* PQ_UNSWAPPABLE? */ vm_page_activate(m); vm_page_sunbusy(m); } } else if (bp->b_iocmd == BIO_READ) { /* * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. */ KASSERT(!pmap_page_is_mapped(m), ("swp_pager_async_iodone: page %p is mapped", m)); KASSERT(m->dirty == 0, ("swp_pager_async_iodone: page %p is dirty", m)); vm_page_valid(m); if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(m); } else { /* * For write success, clear the dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiters up ). * A page is only written to swap after a period of * inactivity. Therefore, we do not expect it to be * reused. */ KASSERT(!pmap_page_is_write_mapped(m), ("swp_pager_async_iodone: page %p is not write" " protected", m)); vm_page_undirty(m); vm_page_deactivate_noreuse(m); vm_page_sunbusy(m); } } /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_WUNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ if (bp->b_flags & B_ASYNC) { mtx_lock(&swbuf_mtx); if (++nsw_wcount_async == 1) wakeup(&nsw_wcount_async); mtx_unlock(&swbuf_mtx); } uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp); } int swap_pager_nswapdev(void) { return (nswapdev); } static void swp_pager_force_dirty(vm_page_t m) { vm_page_dirty(m); swap_pager_unswapped(m); vm_page_launder(m); } /* * swap_pager_swapoff_object: * * Page in all of the pages that have been paged out for an object * to a swap device.
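 */

/*
 * Hypothetical caller sketch (editor-added, not from the original
 * source): swap_pager_nswapdev() above and swap_pager_status() (defined
 * later in this file) give a coarse usage summary without walking the
 * device list by hand.
 */
static void
swap_usage_print_example(void)
{
	int total, used;

	if (swap_pager_nswapdev() == 0)
		return;		/* No swap devices configured. */
	swap_pager_status(&total, &used);
	printf("swap: %d of %d pages used\n", used, total);
}

/*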
*/ static void swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object) { struct swblk *sb; vm_page_t m; vm_pindex_t pi; daddr_t blk; int i, nv, rahead, rv; KASSERT(object->type == OBJT_SWAP, ("%s: Object not swappable", __func__)); for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi)) != NULL; ) { if ((object->flags & OBJ_DEAD) != 0) { /* * Make sure that pending writes finish before * returning. */ vm_object_pip_wait(object, "swpoff"); swp_pager_meta_free_all(object); break; } for (i = 0; i < SWAP_META_PAGES; i++) { /* * Count the number of contiguous valid blocks. */ for (nv = 0; nv < SWAP_META_PAGES - i; nv++) { blk = sb->d[i + nv]; if (!swp_pager_isondev(blk, sp) || blk == SWAPBLK_NONE) break; } if (nv == 0) continue; /* * Look for a page corresponding to the first * valid block and ensure that any pending paging * operations on it are complete. If the page is valid, * mark it dirty and free the swap block. Try to batch * this operation since it may cause sp to be freed, * meaning that we must restart the scan. Avoid busying * valid pages since we may block forever on kernel * stack pages. */ m = vm_page_lookup(object, sb->p + i); if (m == NULL) { m = vm_page_alloc(object, sb->p + i, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) break; } else { if ((m->oflags & VPO_SWAPINPROG) != 0) { m->oflags |= VPO_SWAPSLEEP; VM_OBJECT_SLEEP(object, &object->handle, PSWP, "swpoff", 0); break; } if (vm_page_all_valid(m)) { do { swp_pager_force_dirty(m); } while (--nv > 0 && (m = vm_page_next(m)) != NULL && vm_page_all_valid(m) && (m->oflags & VPO_SWAPINPROG) == 0); break; } if (!vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL)) break; } vm_object_pip_add(object, 1); rahead = SWAP_META_PAGES; rv = swap_pager_getpages_locked(object, &m, 1, NULL, &rahead); if (rv != VM_PAGER_OK) panic("%s: read from swap failed: %d", __func__, rv); vm_object_pip_wakeupn(object, 1); VM_OBJECT_WLOCK(object); vm_page_xunbusy(m); /* * The object lock was dropped so we must restart the * scan of this swap block. Pages paged in during this * iteration will be marked dirty in a future iteration. */ break; } if (i == SWAP_META_PAGES) pi = sb->p + SWAP_META_PAGES; } } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { vm_object_t object; int retries; sx_assert(&swdev_syscall_lock, SA_XLOCKED); retries = 0; full_rescan: mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->type != OBJT_SWAP) continue; mtx_unlock(&vm_object_list_mtx); /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); /* * Dead objects are eventually terminated on their own. */ if ((object->flags & OBJ_DEAD) != 0) goto next_obj; /* * Sync with fences placed after pctrie * initialization. We must not access pctrie below * unless we checked that our object is swap and not * dead. */ atomic_thread_fence_acq(); if (object->type != OBJT_SWAP) goto next_obj; swap_pager_swapoff_object(sp, object); next_obj: VM_OBJECT_WUNLOCK(object); mtx_lock(&vm_object_list_mtx); } mtx_unlock(&vm_object_list_mtx); if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. 
We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } EVENTHANDLER_INVOKE(swapoff, sp); } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. * * Swap metadata is kept in a per-object pc-trie of swblk structures * (un_pager.swp.swp_blks), each mapping a SWAP_META_PAGES-aligned run * of page indices to on-disk swap blocks. */ /* * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free? */ static bool swp_pager_swblk_empty(struct swblk *sb, int start, int limit) { int i; MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES); for (i = start; i < limit; i++) { if (sb->d[i] != SWAPBLK_NONE) return (false); } return (true); } /* * SWP_PAGER_FREE_EMPTY_SWBLK() - frees if a block is free * * Nothing is done if the block is still in use. */ static void swp_pager_free_empty_swblk(vm_object_t object, struct swblk *sb) { if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } } /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is returned. */ static daddr_t swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted; struct swblk *sb, *sb1; vm_pindex_t modpi, rdpi; daddr_t prev_swapblk; int error, i; VM_OBJECT_ASSERT_WLOCKED(object); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { pctrie_init(&object->un_pager.swp.swp_blks); /* * Ensure that swap_pager_swapoff()'s iteration over * object_list does not see a garbage pctrie. */ atomic_thread_fence_rel(); object->type = OBJT_SWAP; object->un_pager.swp.writemappings = 0; KASSERT((object->flags & OBJ_ANON) != 0 || object->handle == NULL, ("default pager %p with handle %p", object, object->handle)); } rdpi = rounddown(pindex, SWAP_META_PAGES); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb == NULL) { if (swapblk == SWAPBLK_NONE) return (SWAPBLK_NONE); for (;;) { sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc == pageproc ? M_USE_RESERVE : 0)); if (sb != NULL) { sb->p = rdpi; for (i = 0; i < SWAP_META_PAGES; i++) sb->d[i] = SWAPBLK_NONE; if (atomic_cmpset_int(&swblk_zone_exhausted, 1, 0)) printf("swblk zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swblk_zone)) { if (atomic_cmpset_int(&swblk_zone_exhausted, 0, 1)) printf("swap blk zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxb", 10); } else uma_zwait(swblk_zone); VM_OBJECT_WLOCK(object); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb != NULL) /* * Somebody swapped out a nearby page, * allocating swblk at the rdpi index, * while we dropped the object lock.
*/ goto allocated; } for (;;) { error = SWAP_PCTRIE_INSERT( &object->un_pager.swp.swp_blks, sb); if (error == 0) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 1, 0)) printf("swpctrie zone ok\n"); break; } VM_OBJECT_WUNLOCK(object); if (uma_zone_exhausted(swpctrie_zone)) { if (atomic_cmpset_int(&swpctrie_zone_exhausted, 0, 1)) printf("swap pctrie zone exhausted, " "increase kern.maxswzone\n"); vm_pageout_oom(VM_OOM_SWAPZ); pause("swzonxp", 10); } else uma_zwait(swpctrie_zone); VM_OBJECT_WLOCK(object); sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi); if (sb1 != NULL) { uma_zfree(swblk_zone, sb); sb = sb1; goto allocated; } } } allocated: MPASS(sb->p == rdpi); modpi = pindex % SWAP_META_PAGES; /* Return prior contents of metadata. */ prev_swapblk = sb->d[modpi]; /* Enter block into metadata. */ sb->d[modpi] = swapblk; /* * Free the swblk if we end up with the empty page run. */ if (swapblk == SWAPBLK_NONE) swp_pager_free_empty_swblk(object, sb); return (prev_swapblk); } /* * SWP_PAGER_META_TRANSFER() - free a range of blocks in the srcobject's swap * metadata, or transfer it into dstobject. * * This routine will free swap metadata structures as they are cleaned * out. */ static void swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t pindex, vm_pindex_t count) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t offset, last; int i, limit, start; VM_OBJECT_ASSERT_WLOCKED(srcobject); if (srcobject->type != OBJT_SWAP || count == 0) return; swp_pager_init_freerange(&s_free, &n_free); offset = pindex; last = pindex + count; for (;;) { sb = SWAP_PCTRIE_LOOKUP_GE(&srcobject->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL || sb->p >= last) break; start = pindex > sb->p ? pindex - sb->p : 0; limit = last - sb->p < SWAP_META_PAGES ? last - sb->p : SWAP_META_PAGES; for (i = start; i < limit; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; if (dstobject == NULL || !swp_pager_xfer_source(srcobject, dstobject, sb->p + i - offset, sb->d[i])) { swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); } sb->d[i] = SWAPBLK_NONE; } pindex = sb->p + SWAP_META_PAGES; if (swp_pager_swblk_empty(sb, 0, start) && swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) { SWAP_PCTRIE_REMOVE(&srcobject->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count) { swp_pager_meta_transfer(object, NULL, pindex, count); } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. 
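 */

/*
 * Illustrative sketch (editor-added, not from the original source) of
 * the bucket arithmetic used throughout the metadata code: a page index
 * selects a swblk by its SWAP_META_PAGES-aligned base, then one of that
 * swblk's d[] slots.  Compare swp_pager_meta_lookup() later in this
 * file, which this mirrors.
 */
static daddr_t
swp_meta_slot_example(vm_object_t object, vm_pindex_t pindex)
{
	struct swblk *sb;

	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
	    rounddown(pindex, SWAP_META_PAGES));
	return (sb != NULL ? sb->d[pindex % SWAP_META_PAGES] : SWAPBLK_NONE);
}

/*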
*/ static void swp_pager_meta_free_all(vm_object_t object) { struct swblk *sb; daddr_t n_free, s_free; vm_pindex_t pindex; int i; VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_SWAP) return; swp_pager_init_freerange(&s_free, &n_free); for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pindex)) != NULL;) { pindex = sb->p + SWAP_META_PAGES; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] == SWAPBLK_NONE) continue; swp_pager_update_freerange(&s_free, &n_free, sb->d[i]); } SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p); uma_zfree(swblk_zone, sb); } swp_pager_freeswapspace(s_free, n_free); } /* * SWP_PAGER_META_LOOKUP() - look up the swapblk for a page index. * * This routine looks up the swapblk assignment in the object's swap * metadata and returns it, or SWAPBLK_NONE if no block is assigned. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. */ static daddr_t swp_pager_meta_lookup(vm_object_t object, vm_pindex_t pindex) { struct swblk *sb; VM_OBJECT_ASSERT_LOCKED(object); /* * The meta data only exists if the object is OBJT_SWAP * and even then might not be allocated yet. */ KASSERT(object->type == OBJT_SWAP, ("Lookup object not swappable")); sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (SWAPBLK_NONE); return (sb->d[pindex % SWAP_META_PAGES]); } /* * Returns the least page index which is greater than or equal to the * parameter pindex and for which there is a swap block allocated. * Returns object's size if the object's type is not swap or if there * are no allocated swap blocks for the object after the requested * pindex. */ vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex) { struct swblk *sb; int i; VM_OBJECT_ASSERT_LOCKED(object); if (object->type != OBJT_SWAP) return (object->size); sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, rounddown(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); if (sb->p < pindex) { for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks, roundup(pindex, SWAP_META_PAGES)); if (sb == NULL) return (object->size); } for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->d[i] != SWAPBLK_NONE) return (sb->p + i); } /* * We get here if a swblk is present in the trie but it * doesn't map any blocks. */ MPASS(0); return (object->size); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); sx_xlock(&swdev_syscall_lock); /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. */
*/ if (swblk_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; - if (vn_isdisk(vp, &error)) { + if (vn_isdisk_error(vp, &error)) { error = swapongeom(vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: sx_xunlock(&swdev_syscall_lock); return (error); } /* * Check that the total amount of swap currently configured does not * exceed half the theoretical maximum. If it does, print a warning * message. */ static void swapon_check_swzone(void) { /* recommend using no more than half that amount */ if (swap_total > swap_maxpages / 2) { printf("warning: total configured swap (%lu pages) " "exceeds maximum recommended amount (%lu pages).\n", swap_total, swap_maxpages / 2); printf("warning: increase kern.maxswzone " "or reduce amount of swap.\n"); } } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags) { struct swdevt *sp, *tsp; daddr_t dvbase; /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_blist = blist_create(nblks, M_WAITOK); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_flags = flags; /* * Do not free the first blocks in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, howmany(BBSIZE, PAGE_SIZE), nblks - howmany(BBSIZE, PAGE_SIZE)); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks - howmany(BBSIZE, PAGE_SIZE); swap_total += nblks; swapon_check_swzone(); swp_sizecheck(); mtx_unlock(&sw_dev_mtx); EVENTHANDLER_INVOKE(swapon, sp); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. 
 */
*/ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sys_swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); sx_xlock(&swdev_syscall_lock); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td->td_ucred); done: sx_xunlock(&swdev_syscall_lock); return (error); } static int swapoff_one(struct swdevt *sp, struct ucred *cred) { u_long nblks; #ifdef MAC int error; #endif sx_assert(&swdev_syscall_lock, SA_XLOCKED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY); error = mac_system_check_swapoff(cred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat) return (ENOMEM); /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks); swap_total -= nblks; mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. */ swap_pager_swapoff(sp); sp->sw_close(curthread, sp); mtx_lock(&sw_dev_mtx); sp->sw_id = NULL; TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; sx_xlock(&swdev_syscall_lock); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); - if (vn_isdisk(sp->sw_vp, NULL)) + if (vn_isdisk(sp->sw_vp)) devname = devtoname(sp->sw_vp->v_rdev); else devname = "[file]"; error = swapoff_one(sp, thread0.td_ucred); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); sx_xunlock(&swdev_syscall_lock); } void swap_pager_status(int *total, int *used) { *total = swap_total; *used = swap_total - swap_pager_avail - nswapdev * howmany(BBSIZE, PAGE_SIZE); } int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len) { struct swdevt *sp; const char *tmp_devname; int error, n; n = 0; error = ENOENT; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n != name) { n++; continue; } xs->xsw_version = XSWDEV_VERSION; xs->xsw_dev = sp->sw_dev; xs->xsw_flags = sp->sw_flags; xs->xsw_nblks = sp->sw_nblks; xs->xsw_used = sp->sw_used; if (devname != NULL) { - if (vn_isdisk(sp->sw_vp, NULL)) + if (vn_isdisk(sp->sw_vp)) tmp_devname = devtoname(sp->sw_vp->v_rdev); else tmp_devname = "[file]"; strncpy(devname, tmp_devname, len); } error = 0; break; } mtx_unlock(&sw_dev_mtx); return (error); } #if defined(COMPAT_FREEBSD11) #define XSWDEV_VERSION_11 1 struct xswdev11 { 
u_int xsw_version; uint32_t xsw_dev; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 { u_int xsw_version; u_int xsw_dev1, xsw_dev2; int xsw_flags; int xsw_nblks; int xsw_used; }; #endif static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { struct xswdev xs; #if defined(__amd64__) && defined(COMPAT_FREEBSD32) struct xswdev32 xs32; #endif #if defined(COMPAT_FREEBSD11) struct xswdev11 xs11; #endif int error; if (arg2 != 1) /* name length */ return (EINVAL); error = swap_dev_info(*(int *)arg1, &xs, NULL, 0); if (error != 0) return (error); #if defined(__amd64__) && defined(COMPAT_FREEBSD32) if (req->oldlen == sizeof(xs32)) { xs32.xsw_version = XSWDEV_VERSION; xs32.xsw_dev1 = xs.xsw_dev; xs32.xsw_dev2 = xs.xsw_dev >> 32; xs32.xsw_flags = xs.xsw_flags; xs32.xsw_nblks = xs.xsw_nblks; xs32.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs32, sizeof(xs32)); return (error); } #endif #if defined(COMPAT_FREEBSD11) if (req->oldlen == sizeof(xs11)) { xs11.xsw_version = XSWDEV_VERSION_11; xs11.xsw_dev = xs.xsw_dev; /* truncation */ xs11.xsw_flags = xs.xsw_flags; xs11.xsw_nblks = xs.xsw_nblks; xs11.xsw_used = xs.xsw_used; error = SYSCTL_OUT(req, &xs11, sizeof(xs11)); return (error); } #endif error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_vm_swap_info, "Swap statistics by device"); /* * Count the approximate swap usage in pages for a vmspace. The * shadowed or not yet copied on write swap blocks are not accounted. * The map must be locked. */ long vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t cur; vm_object_t object; struct swblk *sb; vm_pindex_t e, pi; long count; int i; map = &vmspace->vm_map; count = 0; VM_MAP_ENTRY_FOREACH(cur, map) { if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; object = cur->object.vm_object; if (object == NULL || object->type != OBJT_SWAP) continue; VM_OBJECT_RLOCK(object); if (object->type != OBJT_SWAP) goto unlock; pi = OFF_TO_IDX(cur->offset); e = pi + OFF_TO_IDX(cur->end - cur->start); for (;; pi = sb->p + SWAP_META_PAGES) { sb = SWAP_PCTRIE_LOOKUP_GE( &object->un_pager.swp.swp_blks, pi); if (sb == NULL || sb->p >= e) break; for (i = 0; i < SWAP_META_PAGES; i++) { if (sb->p + i < e && sb->d[i] != SWAPBLK_NONE) count++; } } unlock: VM_OBJECT_RUNLOCK(object); } return (count); } /* * GEOM backend * * Swapping onto disk devices. * */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } /* * Add a reference to the g_consumer for an inflight transaction. */ static void swapgeom_acquire(struct g_consumer *cp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index++; } /* * Remove a reference from the g_consumer. Post a close event if all * references go away, since the function might be called from the * biodone context. 
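*/

/*
 * A sketch of the compat shim in sysctl_vm_swap_info() above: for 32-bit
 * consumers the 64-bit dev_t is split across xsw_dev1/xsw_dev2 and can be
 * reassembled losslessly. The device value here is made up.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t dev = 0x1122334455667788ULL;
        uint32_t dev1, dev2;

        dev1 = (uint32_t)dev;           /* low half, as in xsw_dev1 */
        dev2 = (uint32_t)(dev >> 32);   /* high half, as in xsw_dev2 */
        assert(((uint64_t)dev2 << 32 | dev1) == dev);
        printf("split %#llx -> %#x, %#x\n", (unsigned long long)dev,
            dev1, dev2);
        return (0);
}

/*
 * The FreeBSD 11 shim, by contrast, simply truncates xsw_dev to 32 bits.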
*/ static void swapgeom_release(struct g_consumer *cp, struct swdevt *sp) { mtx_assert(&sw_dev_mtx, MA_OWNED); cp->index--; if (cp->index == 0) { if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0) sp->sw_id = NULL; } } static void swapgeom_done(struct bio *bp2) { struct swdevt *sp; struct buf *bp; struct g_consumer *cp; bp = bp2->bio_caller2; cp = bp2->bio_from; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bp->b_caller1 = NULL; bufdone(bp); sp = bp2->bio_caller1; mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sp->sw_id; if (cp == NULL) { mtx_unlock(&sw_dev_mtx); bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } swapgeom_acquire(cp); mtx_unlock(&sw_dev_mtx); if (bp->b_iocmd == BIO_WRITE) bio = g_new_bio(); else bio = g_alloc_bio(); if (bio == NULL) { mtx_lock(&sw_dev_mtx); swapgeom_release(cp, sp); mtx_unlock(&sw_dev_mtx); bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; printf("swap_pager: cannot allocate bio\n"); bufdone(bp); return; } bp->b_caller1 = bio; bio->bio_caller1 = sp; bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; if (!buf_mapped(bp)) { bio->bio_ma = bp->b_pages; bio->bio_data = unmapped_buf; bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK; bio->bio_ma_n = bp->b_npages; bio->bio_flags |= BIO_UNMAPPED; } else { bio->bio_data = bp->b_data; bio->bio_ma = NULL; } g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; int destroy; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == cp) { sp->sw_flags |= SW_CLOSING; break; } } /* * Drop reference we were created with. Do directly since we're in a * special context where we don't have to queue the call to * swapgeom_close_ev(). */ cp->index--; destroy = ((sp != NULL) && (cp->index == 0)); if (destroy) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); if (destroy) swapgeom_close_ev(cp, 0); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { struct g_consumer *cp; mtx_lock(&sw_dev_mtx); cp = sw->sw_id; sw->sw_id = NULL; mtx_unlock(&sw_dev_mtx); /* * swapgeom_close() may be called from the biodone context, * where we cannot perform topology changes. Delegate the * work to the events thread. */ if (cp != NULL) g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL); } static int swapongeom_locked(struct cdev *dev, struct vnode *vp) { struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; pp = g_dev_getprovider(dev); if (pp == NULL) return (ENODEV); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap"); cp = g_new_consumer(gp); cp->index = 1; /* Number of active I/Os, plus one for being active. 
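*/

/*
 * A userspace sketch of the swapgeom_acquire()/swapgeom_release() pattern
 * above: every in-flight I/O holds a reference on the consumer, and the
 * final release defers the real close work to a callback because it may
 * run from I/O completion context. Locking is elided here.
 */
#include <stdio.h>

struct consumer {
        int index;                      /* reference count */
};

static void
close_ev(struct consumer *cp)
{
        printf("consumer %p closed\n", (void *)cp);
}

static void
acquire(struct consumer *cp)
{
        cp->index++;
}

static void
release(struct consumer *cp)
{
        /* Last reference gone: hand off to the close callback. */
        if (--cp->index == 0)
                close_ev(cp);
}

int
main(void)
{
        struct consumer cp = { .index = 1 };    /* 1 for "being active" */

        acquire(&cp);                   /* I/O issued */
        release(&cp);                   /* I/O completed */
        release(&cp);                   /* device closing: drop base ref */
        return (0);
}

/*
 * In the kernel the final release posts swapgeom_close_ev() to the GEOM
 * event thread rather than calling it inline.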
*/ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; g_attach(cp, pp); /* * XXX: Every time you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error != 0) { g_detach(cp); g_destroy_consumer(cp); return (error); } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(dev), (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0); return (0); } static int swapongeom(struct vnode *vp) { int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type != VCHR || VN_IS_DOOMED(vp)) { error = ENOENT; } else { g_topology_lock(); error = swapongeom_locked(vp->v_rdev, vp); g_topology_unlock(); } VOP_UNLOCK(vp); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. * */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_system_check_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL); (void) VOP_UNLOCK(vp); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV, 0); return (0); } static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS) { int error, new, n; new = nsw_wcount_async_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new > nswbuf / 2 || new < 1) return (EINVAL); mtx_lock(&swbuf_mtx); while (nsw_wcount_async_max != new) { /* * Adjust difference. If the current async count is too low, * we will need to squeeze our update slowly in. Sleep with a * higher priority than getpbuf() to finish faster.
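*/

/*
 * A sketch of the address translation both strategy routines perform:
 * swap block numbers are global page-sized units spanning all devices, so
 * each device subtracts its own sw_first before converting to a local
 * offset. GEOM uses a byte offset; the vnode backend uses DEV_BSIZE
 * blocks via ctodb(). All values are assumed.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define DEV_BSIZE 512
#define ctodb(x)  ((x) * (PAGE_SIZE / DEV_BSIZE))

int
main(void)
{
        uint64_t sw_first = 125001;     /* first global block of the device */
        uint64_t b_blkno = 125005;      /* global block of this request */
        uint64_t byte_off, dev_blk;

        byte_off = (b_blkno - sw_first) * PAGE_SIZE;    /* GEOM path */
        dev_blk = ctodb(b_blkno - sw_first);            /* vnode path */
        assert(byte_off == 4 * PAGE_SIZE);
        assert(dev_blk == ctodb(4));
        printf("byte offset %llu, device block %llu\n",
            (unsigned long long)byte_off, (unsigned long long)dev_blk);
        return (0);
}

/*
 * The one-page gap that swaponsomething() leaves between devices keeps a
 * request from ever straddling two of these translations.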
*/ n = new - nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } else { nsw_wcount_async_max -= nsw_wcount_async; nsw_wcount_async = 0; msleep(&nsw_wcount_async, &swbuf_mtx, PSWP, "swpsysctl", 0); } } mtx_unlock(&swbuf_mtx); return (0); } static void swap_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_ANON) == 0, ("Splittable object with writecount")); object->un_pager.swp.writemappings += (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } static void swap_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { VM_OBJECT_WLOCK(object); KASSERT((object->flags & OBJ_ANON) == 0, ("Splittable object with writecount")); object->un_pager.swp.writemappings -= (vm_ooffset_t)end - start; VM_OBJECT_WUNLOCK(object); } Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 364371) +++ head/sys/vm/vnode_pager.c (revision 364372) @@ -1,1607 +1,1607 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, vop_getpages_iodone_t, void *); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *cred); static int vnode_pager_generic_getpages_done(struct buf *); static void vnode_pager_generic_getpages_done_async(struct buf *); static void vnode_pager_update_writecount(vm_object_t, vm_offset_t, vm_offset_t); static void vnode_pager_release_writecount(vm_object_t, vm_offset_t, vm_offset_t); struct pagerops vnodepagerops = { .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_getpages_async = vnode_pager_getpages_async, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, .pgo_update_writecount = vnode_pager_update_writecount, .pgo_release_writecount = vnode_pager_release_writecount, }; static struct domainset *vnode_domainset = NULL; SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RW, &vnode_domainset, 0, sysctl_handle_domainset, "A", "Default vnode NUMA policy"); static int nvnpbufs; SYSCTL_INT(_vm, OID_AUTO, vnode_pbufs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &nvnpbufs, 0, "number of physical buffers allocated for vnode pager"); static uma_zone_t vnode_pbuf_zone; static void vnode_pager_init(void *dummy) { #ifdef __LP64__ nvnpbufs = nswbuf * 2; #else nvnpbufs = nswbuf / 2; #endif TUNABLE_INT_FETCH("vm.vnode_pbufs", &nvnpbufs); vnode_pbuf_zone = pbuf_zsecond_create("vnpbuf", nvnpbufs); } SYSINIT(vnode_pager, SI_SUB_CPU, SI_ORDER_ANY, vnode_pager_init, NULL); /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; struct vattr va; bool last; - if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) + if (!vn_isdisk(vp) && vn_canvmio(vp) == FALSE) return (0); object = vp->v_object; if (object != NULL) return (0); if (size == 0) { - if (vn_isdisk(vp, NULL)) { + if (vn_isdisk(vp)) { size = IDX_TO_OFF(INT_MAX); } else { if (VOP_GETATTR(vp, &va, td->td_ucred)) return (0); size = va.va_size; } } object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. We still have * to serialize with vnode_pager_dealloc() for the last * potential reference. 
*/ VM_OBJECT_RLOCK(object); last = refcount_release(&object->ref_count); VM_OBJECT_RUNLOCK(object); if (last) vrele(vp); KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object")); return (0); } void vnode_destroy_vobject(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; if (obj == NULL || obj->handle != vp) return; ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_WLOCK(obj); MPASS(obj->type == OBJT_VNODE); umtx_shm_object_terminated(obj); if (obj->ref_count == 0) { KASSERT((obj->flags & OBJ_DEAD) == 0, ("vnode_destroy_vobject: Terminating dead object")); vm_object_set_flag(obj, OBJ_DEAD); /* * Clean pages and flush buffers. */ vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(obj); vinvalbuf(vp, V_SAVE, 0, 0); BO_LOCK(&vp->v_bufobj); vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); VM_OBJECT_WLOCK(obj); vm_object_terminate(obj); } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); VM_OBJECT_WUNLOCK(obj); } KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object)); } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. */ vm_object_t vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *)handle; ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc"); VNPASS(vp->v_usecount > 0, vp); retry: object = vp->v_object; if (object == NULL) { /* * Add an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); object->un_pager.vnp.vnp_size = size; object->un_pager.vnp.writemappings = 0; object->domain.dr_policy = vnode_domainset; object->handle = handle; if ((vp->v_vflag & VV_VMSIZEVNLOCK) != 0) { VM_OBJECT_WLOCK(object); vm_object_set_flag(object, OBJ_SIZEVNLOCK); VM_OBJECT_WUNLOCK(object); } VI_LOCK(vp); if (vp->v_object != NULL) { /* * Object has been created while we were allocating. */ VI_UNLOCK(vp); VM_OBJECT_WLOCK(object); KASSERT(object->ref_count == 1, ("leaked ref %p %d", object, object->ref_count)); object->type = OBJT_DEAD; refcount_init(&object->ref_count, 0); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); goto retry; } vp->v_object = object; VI_UNLOCK(vp); vrefact(vp); } else { vm_object_reference(object); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(object); vm_object_color(object, 0); VM_OBJECT_WUNLOCK(object); } #endif } return (object); } /* * The object must be locked. */ static void vnode_pager_dealloc(vm_object_t object) { struct vnode *vp; int refs; vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "vnpdea"); refs = object->ref_count; object->handle = NULL; object->type = OBJT_DEAD; ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); if (object->un_pager.vnp.writemappings > 0) { object->un_pager.vnp.writemappings = 0; VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } vp->v_object = NULL; VI_LOCK(vp); /* * vm_map_entry_set_vnode_text() cannot reach this vnode by * following object->handle. Clear all text references now. * This also clears the transient references from * kern_execve(), which is fine because dead_vnodeops uses nop * for VOP_UNSET_TEXT(). 
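*/

/*
 * A loose userspace sketch of the creation race handled in
 * vnode_pager_alloc() above: allocate a candidate object, install it only
 * if none appeared meanwhile, otherwise destroy the candidate and retry.
 * The kernel serializes with the vnode interlock; C11 atomics stand in
 * for it here purely for illustration.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct object {
        int refs;
};

static _Atomic(struct object *) v_object;

static struct object *
get_vobject(void)
{
        struct object *obj, *expect;

        for (;;) {
                obj = atomic_load(&v_object);
                if (obj != NULL)
                        return (obj);   /* existing object: reuse it */
                obj = calloc(1, sizeof(*obj));
                obj->refs = 1;
                expect = NULL;
                if (atomic_compare_exchange_strong(&v_object, &expect, obj))
                        return (obj);   /* we installed it */
                free(obj);              /* lost the race; retry */
        }
}

int
main(void)
{
        printf("first:  %p\n", (void *)get_vobject());
        printf("second: %p\n", (void *)get_vobject());  /* same pointer */
        return (0);
}

/*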
*/ if (vp->v_writecount < 0) vp->v_writecount = 0; VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(object); if (refs > 0) vunref(vp); VM_OBJECT_WLOCK(object); } static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { struct vnode *vp = object->handle; daddr_t bn; uintptr_t lockstate; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; VM_OBJECT_ASSERT_LOCKED(object); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || VN_IS_DOOMED(vp)) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } lockstate = VM_OBJECT_DROP(object); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VM_OBJECT_PICKUP(object, lockstate); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { /* * The BMAP vop can report a partial block in the * 'after', but must not report blocks after EOF. * Assert the latter, and truncate 'after' in case * of the former. */ KASSERT((reqblock + *after) * pagesperblock < roundup2(object->size, pagesperblock), ("%s: reqblock %jd after %d size %ju", __func__, (intmax_t )reqblock, *after, (uintmax_t )object->size)); *after *= pagesperblock; *after += pagesperblock - (poff + 1); if (pindex + *after >= object->size) *after = object->size - 1 - pindex; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; #ifdef DEBUG_VFS_LOCKS { struct mount *mp; mp = vp->v_mount; if (mp != NULL && (mp->mnt_kern_flag & MNTK_VMSETSIZE_BUG) == 0) assert_vop_elocked(vp, "vnode_pager_setsize and not locked vnode"); } #endif VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); return; } KASSERT(object->type == OBJT_VNODE, ("not vnode-backed object %p", object)); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_WUNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. 
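*/

/*
 * A sketch of the unit rescaling in vnode_pager_haspage() above:
 * VOP_BMAP() reports contiguity in filesystem blocks, which the pager
 * converts to pages around the requested page's position within its
 * block. Block and page sizes are assumed.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096

int
main(void)
{
        int bsize = 16384;                      /* fs block size */
        int pagesperblock = bsize / PAGE_SIZE;  /* 4 */
        long pindex = 10;                       /* requested page */
        int before = 1, after = 2;              /* whole blocks from BMAP */
        int poff = pindex % pagesperblock;      /* page within its block */

        before = before * pagesperblock + poff;
        after = after * pagesperblock + pagesperblock - (poff + 1);
        assert(poff == 2 && before == 6 && after == 9);
        printf("run: %d pages before, %d after\n", before, after);
        return (0);
}

/*
 * The kernel then clamps 'after' so it never extends past object->size,
 * exactly as the code above does.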
*/ if (!(nsize & PAGE_MASK)) goto out; m = vm_page_grab(object, OFF_TO_IDX(nsize), VM_ALLOC_NOCREAT); if (m == NULL) goto out; if (!vm_page_none_valid(m)) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * Update the valid bits to reflect the blocks that * have been zeroed. Some of these valid bits may * have already been set. */ vm_page_set_valid_range(m, base, size); /* * Round "base" to the next block boundary so that the * dirty bit for a partially zeroed block is not * cleared. */ base = roundup2(base, DEV_BSIZE); /* * Clear out partial-page dirty bits. * * note that we do not clear out the valid * bits. This would prevent bogus_page * replacement from working properly. */ vm_page_clear_dirty(m, base, PAGE_SIZE - base); } vm_page_xunbusy(m); } out: #if defined(__powerpc__) && !defined(__powerpc64__) object->un_pager.vnp.vnp_size = nsize; #else atomic_store_64(&object->un_pager.vnp.vnp_size, nsize); #endif object->size = nobjsize; VM_OBJECT_WUNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (address < 0) return -1; if (VN_IS_DOOMED(vp)) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize / PAGE_SIZE; *run -= voffset / PAGE_SIZE; } } return (err); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (VN_IS_DOOMED(vp)) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) { KASSERT(bp->b_error != 0, ("%s: buf error but b_error == 0\n", __func__)); error = bp->b_error; } /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, 
("vnode_pager_input_smlfs: page %p is dirty", m)); vm_page_bits_set(m, &m->valid, bits); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(vm_object_t object, vm_page_t m) { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_WUNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_WLOCK(object); } KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m)); if (!error) vm_page_valid(m); return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct vnode *vp; int rtval; /* Handle is stable with paging in progress. */ vp = object->handle; rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); return rtval; } static int vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg) { struct vnode *vp; int rtval; vp = object->handle; rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages_async not implemented\n")); return (rtval); } /* * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for * local filesystems, where partially valid pages can only occur at * the end of file. */ int vnode_pager_local_getpages(struct vop_getpages_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap) { int error; error = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg); if (error != 0 && ap->a_iodone != NULL) ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); return (error); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. 
*/ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; #ifdef INVARIANTS off_t blkno0; #endif int bsize, pagesperblock; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (VN_IS_DOOMED(vp)) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= nitems(bp->b_pages), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ if (!vm_page_none_valid(m[count - 1]) && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { uma_zfree(vnode_pbuf_zone, bp); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { uma_zfree(vnode_pbuf_zone, bp); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { uma_zfree(vnode_pbuf_zone, bp); for (i = 0; i < count; i++) { VM_CNT_INC(v_vnodein); VM_CNT_INC(v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); uma_zfree(vnode_pbuf_zone, bp); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); vm_page_valid(m[0]); return (VM_PAGER_OK); } #ifdef INVARIANTS blkno0 = bp->b_blkno; #endif bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. */ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); /* * Check that total amount of pages fit into buf. Trim rbehind and * rahead evenly if not. 
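*/

/*
 * A sketch of the read-behind/read-ahead clamping in
 * vnode_pager_generic_getpages() above, simplified to scalars: the
 * caller's wishes are limited by on-disk contiguity (before/after) and by
 * the object's bounds. All values are assumed.
 */
#include <assert.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
        long obj_size = 1000;           /* pages in the object */
        long first = 8, last = 11;      /* requested run of pages */
        int before = 20, after = 30;    /* contiguity around the run */
        int rbehind = 64, rahead = 64;  /* caller's request */

        rbehind = MIN(rbehind, before);
        rbehind = MIN(rbehind, first);          /* not below index 0 */
        rahead = MIN(rahead, after);
        rahead = MIN(rahead, obj_size - last);  /* not past the object */
        assert(rbehind == 8 && rahead == 30);
        printf("read %d behind, %d ahead\n", rbehind, rahead);
        return (0);
}

/*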
*/ if (rbehind + rahead + count > nitems(bp->b_pages)) { int trim, sum; trim = rbehind + rahead + count - nitems(bp->b_pages) + 1; sum = rbehind + rahead; if (rbehind == before) { /* Roundup rbehind trim to block size. */ rbehind -= roundup(trim * rbehind / sum, pagesperblock); if (rbehind < 0) rbehind = 0; } else rbehind -= trim * rbehind / sum; rahead -= trim * rahead / sum; } KASSERT(rbehind + rahead + count <= nitems(bp->b_pages), ("%s: behind %d ahead %d count %d", __func__, rbehind, rahead, count)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. */ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; #ifdef INVARIANTS KASSERT(bp->b_npages <= nitems(bp->b_pages), ("%s: buf %p overflowed", __func__, bp)); for (int j = 1, prev = 0; j < bp->b_npages; j++) { if (bp->b_pages[j] == bogus_page) continue; KASSERT(bp->b_pages[j]->pindex - bp->b_pages[prev]->pindex == j - prev, ("%s: pages array not consecutive, bp %p", __func__, bp)); prev = j; } #endif /* * Recalculate first offset and bytecount with regards to read behind. * Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. 
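*/

/*
 * A sketch of the overflow trim in vnode_pager_generic_getpages() above:
 * when the optional pages would overflow the buf's page array, the excess
 * is shaved off rbehind and rahead in proportion to their sizes. Here
 * maxpages stands in for nitems(bp->b_pages) and is an assumed value.
 */
#include <assert.h>
#include <stdio.h>

int
main(void)
{
        int maxpages = 32;
        int count = 4, rbehind = 40, rahead = 20;

        if (rbehind + rahead + count > maxpages) {
                int trim = rbehind + rahead + count - maxpages + 1;
                int sum = rbehind + rahead;

                rbehind -= trim * rbehind / sum;        /* 40 -> 18 */
                rahead -= trim * rahead / sum;          /* 20 -> 9 */
        }
        assert(rbehind + rahead + count <= maxpages);
        printf("trimmed to rbehind %d, rahead %d\n", rbehind, rahead);
        return (0);
}

/*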
*/ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) == (blkno0 - bp->b_blkno) * DEV_BSIZE + IDX_TO_OFF(m[0]->pindex) % bsize, ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju " "blkno0 %ju b_blkno %ju", bsize, (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex, (uintmax_t)blkno0, (uintmax_t)bp->b_blkno)); atomic_add_long(&runningbufspace, bp->b_runningbufspace); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } } static void vnode_pager_generic_getpages_done_async(struct buf *bp) { int error; error = vnode_pager_generic_getpages_done(bp); /* Run the iodone upon the requested range. */ bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore, bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error); for (int i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); uma_zfree(vnode_pbuf_zone, bp); } static int vnode_pager_generic_getpages_done(struct buf *bp) { vm_object_t object; off_t tfoff, nextoff; int i, error; KASSERT((bp->b_ioflags & BIO_ERROR) == 0 || bp->b_error != 0, ("%s: buf error but b_error == 0\n", __func__)); error = (bp->b_ioflags & BIO_ERROR) != 0 ? bp->b_error : 0; object = bp->b_vp->v_object; if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) { if (!buf_mapped(bp)) { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } bzero(bp->b_data + bp->b_bcount, PAGE_SIZE * bp->b_npages - bp->b_bcount); } if (buf_mapped(bp)) { pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); bp->b_data = unmapped_buf; } /* * If the read failed, we must free any read ahead/behind pages here. * The requested pages are freed by the caller (for sync requests) * or by the bp->b_pgiodone callback (for async requests). */ if (error != 0) { VM_OBJECT_WLOCK(object); for (i = 0; i < bp->b_pgbefore; i++) vm_page_free_invalid(bp->b_pages[i]); for (i = bp->b_npages - bp->b_pgafter; i < bp->b_npages; i++) vm_page_free_invalid(bp->b_pages[i]); VM_OBJECT_WUNLOCK(object); return (error); } /* Read lock to protect size. */ VM_OBJECT_RLOCK(object); for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex); i < bp->b_npages; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = bp->b_pages[i]; if (mt == bogus_page) continue; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. 
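*/

/*
 * A sketch of the EOF handling in the loop of
 * vnode_pager_generic_getpages_done() above: a page straddling
 * end-of-file is marked valid only up to the last file byte it covers.
 * Sizes are assumed.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096

int
main(void)
{
        long vnp_size = 9 * PAGE_SIZE + 100;    /* file size */
        long tfoff = 9 * PAGE_SIZE;             /* offset of this page */
        long nextoff = tfoff + PAGE_SIZE;
        long valid;

        valid = nextoff <= vnp_size ? PAGE_SIZE : vnp_size - tfoff;
        assert(valid == 100);
        printf("mark first %ld bytes valid\n", valid);
        return (0);
}

/*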
*/ vm_page_valid(mt); KASSERT(mt->dirty == 0, ("%s: page %p is dirty", __func__, mt)); KASSERT(!pmap_page_is_mapped(mt), ("%s: page %p is mapped", __func__, mt)); } else { /* * Read did not fill up entire page. * * Currently we do not set the entire page valid, * we just try to clear the piece that we couldn't * read. */ vm_page_set_valid_range(mt, 0, object->un_pager.vnp.vnp_size - tfoff); KASSERT((mt->dirty & vm_page_bits(0, object->un_pager.vnp.vnp_size - tfoff)) == 0, ("%s: page %p is dirty", __func__, mt)); } if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(mt); } VM_OBJECT_RUNLOCK(object); return (error); } /* * EOPNOTSUPP is no longer legal. For local media VFS's that do not * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to * vnode_pager_generic_putpages() to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_PUTPAGES. */ static void vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int rtval; struct vnode *vp; int bytes = count * PAGE_SIZE; /* * Force synchronous operation if we are extremely low on memory * to prevent a low-memory deadlock. VOP operations often need to * allocate more memory to initiate the I/O ( i.e. do a BMAP * operation ). The swapper handles the case by limiting the amount * of asynchronous I/O, but that sort of solution doesn't scale well * for the vnode pager without a lot of work. * * Also, the backing vnode's iodone routine may not wake the pageout * daemon up. This should probably be addressed XXX. */ if (vm_page_count_min()) flags |= VM_PAGER_PUT_SYNC; /* * Call device-specific putpages function */ vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: stale FS putpages\n")); VM_OBJECT_WLOCK(object); } static int vn_off2bidx(vm_ooffset_t offset) { return ((offset & PAGE_MASK) / DEV_BSIZE); } static bool vn_dirty_blk(vm_page_t m, vm_ooffset_t offset) { KASSERT(IDX_TO_OFF(m->pindex) <= offset && offset < IDX_TO_OFF(m->pindex + 1), ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex, (uintmax_t)offset)); return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. * * This is typically called indirectly via the pageout daemon and * clustering has already typically occurred, so in general we ask the * underlying filesystem to write the data out asynchronously rather * than delayed.
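*/

/*
 * A sketch of the per-block dirty test behind vn_off2bidx() and
 * vn_dirty_blk() above: each page tracks dirtiness in DEV_BSIZE chunks,
 * one bit per chunk, and a byte offset selects the bit within its page.
 * The 8-bit mask assumes 4 KB pages of 512-byte blocks.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)
#define DEV_BSIZE 512

static int
off2bidx(long offset)
{
        return ((offset & PAGE_MASK) / DEV_BSIZE);
}

static int
dirty_blk(uint8_t dirty, long offset)
{
        return ((dirty & (1 << off2bidx(offset))) != 0);
}

int
main(void)
{
        uint8_t dirty = 0x0c;           /* blocks 2 and 3 are dirty */

        assert(!dirty_blk(dirty, 0));
        assert(dirty_blk(dirty, 2 * DEV_BSIZE));
        assert(dirty_blk(dirty, 3 * DEV_BSIZE + 100));
        printf("offset 1536 is block %d of its page\n", off2bidx(1536));
        return (0);
}

/*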
*/ int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, int flags, int *rtvals) { vm_object_t object; vm_page_t m; vm_ooffset_t maxblksz, next_offset, poffset, prev_offset; struct uio auio; struct iovec aiov; off_t prev_resid, wrsz; int count, error, i, maxsize, ncount, pgoff, ppscheck; bool in_hole; static struct timeval lastfail; static int curfail; object = vp->v_object; count = bytecount / PAGE_SIZE; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_ERROR; if ((int64_t)ma[0]->pindex < 0) { printf("vnode_pager_generic_putpages: " "attempt to write meta-data 0x%jx(%lx)\n", (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty); rtvals[0] = VM_PAGER_BAD; return (VM_PAGER_BAD); } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(ma[0]->pindex); /* * If the page-aligned write is larger than the actual file we * have to invalidate pages occurring beyond the file EOF. However, * there is an edge case where a file may not be page-aligned and * the last page is partially invalid. In this case the filesystem * may not properly clear the dirty bits for the entire page (which * could be VM_PAGE_BITS_ALL due to the page having been mmap()d). * With the page busied we are free to fix up the dirty bits here. * * We do not under any circumstances truncate the valid bits, as * this will screw up bogus page replacement. */ VM_OBJECT_RLOCK(object); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) { maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { pgoff = roundup2(pgoff, DEV_BSIZE); /* * If the page is busy and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("vnode_pager_generic_putpages: page %p is not read-only", m)); MPASS(m->dirty != 0); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { maxsize = 0; ncount = 0; } for (i = ncount; i < count; i++) rtvals[i] = VM_PAGER_BAD; } VM_OBJECT_RUNLOCK(object); auio.uio_iov = &aiov; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_td = NULL; maxblksz = roundup2(poffset + maxsize, DEV_BSIZE); for (prev_offset = poffset; prev_offset < maxblksz;) { /* Skip clean blocks. */ for (in_hole = true; in_hole && prev_offset < maxblksz;) { m = ma[OFF_TO_IDX(prev_offset - poffset)]; for (i = vn_off2bidx(prev_offset); i < sizeof(vm_page_bits_t) * NBBY && prev_offset < maxblksz; i++) { if (vn_dirty_blk(m, prev_offset)) { in_hole = false; break; } prev_offset += DEV_BSIZE; } } if (in_hole) goto write_done; /* Find longest run of dirty blocks. */ for (next_offset = prev_offset; next_offset < maxblksz;) { m = ma[OFF_TO_IDX(next_offset - poffset)]; for (i = vn_off2bidx(next_offset); i < sizeof(vm_page_bits_t) * NBBY && next_offset < maxblksz; i++) { if (!vn_dirty_blk(m, next_offset)) goto start_write; next_offset += DEV_BSIZE; } } start_write: if (next_offset > poffset + maxsize) next_offset = poffset + maxsize; /* * Getting here requires finding a dirty block in the * 'skip clean blocks' loop.
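*/

/*
 * A sketch of the EOF clamping at the top of
 * vnode_pager_generic_putpages() above: a page-aligned write that
 * overshoots the file is trimmed, the usable page count is recomputed,
 * and the partial tail is rounded up to a DEV_BSIZE boundary for the
 * dirty-bit fixup. All sizes are assumed.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)
#define DEV_BSIZE 512
#define btoc(x)   (((x) + PAGE_MASK) / PAGE_SIZE)
#define roundup2(x, y) (((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
        long poffset = 16 * PAGE_SIZE;          /* start of the page run */
        long vnp_size = 18 * PAGE_SIZE + 600;   /* file size */
        long maxsize = 4 * PAGE_SIZE;           /* 4 pages submitted */
        long ncount = 4, pgoff = 0;

        if (poffset + maxsize > vnp_size) {
                maxsize = vnp_size - poffset;   /* 2 pages + 600 bytes */
                ncount = btoc(maxsize);         /* 3 usable pages */
                pgoff = maxsize & PAGE_MASK;    /* 600 */
                pgoff = roundup2(pgoff, DEV_BSIZE);     /* 1024 */
        }
        assert(ncount == 3 && pgoff == 1024);
        printf("write %ld bytes over %ld pages, clear dirty from %ld\n",
            maxsize, ncount, pgoff);
        return (0);
}

/*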
*/ MPASS(prev_offset < next_offset); aiov.iov_base = NULL; auio.uio_iovcnt = 1; auio.uio_offset = prev_offset; prev_resid = auio.uio_resid = aiov.iov_len = next_offset - prev_offset; error = VOP_WRITE(vp, &auio, vnode_pager_putpages_ioflags(flags), curthread->td_ucred); wrsz = prev_resid - auio.uio_resid; if (wrsz == 0) { if (ppsratecheck(&lastfail, &curfail, 1) != 0) { vn_printf(vp, "vnode_pager_putpages: " "zero-length write at %ju resid %zd\n", auio.uio_offset, auio.uio_resid); } break; } /* Adjust the starting offset for next iteration. */ prev_offset += wrsz; MPASS(auio.uio_offset == prev_offset); ppscheck = 0; if (error != 0 && (ppscheck = ppsratecheck(&lastfail, &curfail, 1)) != 0) vn_printf(vp, "vnode_pager_putpages: I/O error %d\n", error); if (auio.uio_resid != 0 && (ppscheck != 0 || ppsratecheck(&lastfail, &curfail, 1) != 0)) vn_printf(vp, "vnode_pager_putpages: residual I/O %zd " "at %ju\n", auio.uio_resid, (uintmax_t)ma[0]->pindex); if (error != 0 || auio.uio_resid != 0) break; } write_done: /* Mark completely processed pages. */ for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++) rtvals[i] = VM_PAGER_OK; /* Mark partial EOF page. */ if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0) rtvals[i++] = VM_PAGER_OK; /* Unwritten pages in range, free bonus if the page is clean. */ for (; i < ncount; i++) rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR; VM_CNT_ADD(v_vnodepgsout, i); VM_CNT_INC(v_vnodeout); return (rtvals[0]); } int vnode_pager_putpages_ioflags(int pager_flags) { int ioflags; /* * Pageouts are already clustered, use IO_ASYNC to force a * bawrite() rather than a bdwrite() to prevent paging I/O * from saturating the buffer cache. Dummy-up the sequential * heuristic to cause large ranges to cluster. If neither * IO_SYNC nor IO_ASYNC is set, the system decides how to * cluster. */ ioflags = IO_VMIO; if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0) ioflags |= IO_SYNC; else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0) ioflags |= IO_ASYNC; ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL : 0; ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0; ioflags |= IO_SEQMAX << IO_SEQSHIFT; return (ioflags); } /* * vnode_pager_undirty_pages(). * * A helper to mark pages as clean after pageout that was possibly * done with a short write. The lpos argument specifies the page run * length in bytes, and the written argument specifies how many bytes * were actually written. eof is the offset past the last valid byte * in the vnode using the absolute file position of the first byte in * the run as the base from which it is computed. */ void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof, int lpos) { vm_object_t obj; int i, pos, pos_devb; if (written == 0 && eof >= lpos) return; obj = ma[0]->object; for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) { if (pos < trunc_page(written)) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(ma[i]); } else { /* Partially written page. */ rtvals[i] = VM_PAGER_AGAIN; vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK); } } if (eof >= lpos) /* avoid truncation */ return; for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) { if (pos != trunc_page(pos)) { /* * The page contains the last valid byte in * the vnode, mark the rest of the page as * clean, potentially making the whole page * clean.
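*/

/*
 * A sketch of the short-write accounting in vnode_pager_undirty_pages()
 * above: fully written pages are undirtied and reported VM_PAGER_OK,
 * while a trailing partially written page keeps its tail dirty and is
 * retried. The written byte count is assumed.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define PAGE_MASK (PAGE_SIZE - 1)
#define trunc_page(x) ((x) & ~PAGE_MASK)

int
main(void)
{
        int written = 2 * PAGE_SIZE + 900;      /* bytes the write managed */
        int i, pos;

        for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
                if (pos < trunc_page(written))
                        printf("page %d: fully clean, VM_PAGER_OK\n", i);
                else
                        printf("page %d: %d bytes clean, VM_PAGER_AGAIN\n",
                            i, written & PAGE_MASK);
        }
        assert(i == 3);
        return (0);
}

/*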
*/ pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE); vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE - pos_devb); /* * If the page was cleaned, report the pageout * on it as successful. msync() no longer * needs to write out the page, endlessly * creating write requests and dirty buffers. */ if (ma[i]->dirty == 0) rtvals[i] = VM_PAGER_OK; pos = round_page(pos); } else { /* vm_pageout_flush() clears dirty */ rtvals[i] = VM_PAGER_BAD; pos += PAGE_SIZE; } } } static void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; vm_ooffset_t old_wm; VM_OBJECT_WLOCK(object); if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } old_wm = object->un_pager.vnp.writemappings; object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start; vp = object->handle; if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) { ASSERT_VOP_LOCKED(vp, "v_writecount inc"); VOP_ADD_WRITECOUNT_CHECKED(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) { ASSERT_VOP_LOCKED(vp, "v_writecount dec"); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } VM_OBJECT_WUNLOCK(object); } static void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; struct mount *mp; vm_offset_t inc; VM_OBJECT_WLOCK(object); /* * First, recheck the object type to account for the race when * the vnode is reclaimed. */ if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } /* * Optimize for the case when writemappings is not going to * zero. */ inc = end - start; if (object->un_pager.vnp.writemappings != inc) { object->un_pager.vnp.writemappings -= inc; VM_OBJECT_WUNLOCK(object); return; } vp = object->handle; vhold(vp); VM_OBJECT_WUNLOCK(object); mp = NULL; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_SHARED | LK_RETRY); /* * Decrement the object's writemappings by swapping the start * and end arguments for vnode_pager_update_writecount(). If * there was not a race with vnode reclamation, then the * vnode's v_writecount is decremented. */ vnode_pager_update_writecount(object, end, start); VOP_UNLOCK(vp); vdrop(vp); if (mp != NULL) vn_finished_write(mp); }
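/*
 * A sketch of the writemappings bookkeeping implemented by
 * vnode_pager_update_writecount() above: v_writecount is bumped on the
 * 0 -> nonzero transition and dropped on nonzero -> 0, and release is
 * the same routine with start and end swapped so the delta is negative.
 * Locking and the real vnode layer are elided.
 */
#include <assert.h>
#include <stdio.h>

struct vnode {
        int v_writecount;
};

static void
update_writecount(struct vnode *vp, long *wm, long start, long end)
{
        long old = *wm;

        *wm += end - start;             /* negative delta when swapped */
        if (old == 0 && *wm != 0)
                vp->v_writecount++;
        else if (old != 0 && *wm == 0)
                vp->v_writecount--;
}

int
main(void)
{
        struct vnode vn = { 0 };
        long wm = 0;

        update_writecount(&vn, &wm, 0, 8192);   /* writable mapping added */
        assert(vn.v_writecount == 1);
        update_writecount(&vn, &wm, 8192, 0);   /* released: swapped args */
        assert(vn.v_writecount == 0 && wm == 0);
        printf("balanced: wm=%ld, v_writecount=%d\n", wm, vn.v_writecount);
        return (0);
}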