Index: include/unistd.h =================================================================== --- include/unistd.h +++ include/unistd.h @@ -384,6 +384,7 @@ /* ISO/IEC 9945-1: 1996 */ #if __POSIX_VISIBLE >= 199506 || __XSI_VISIBLE int fsync(int); +int fdatasync(int); /* * ftruncate() was in the POSIX Realtime Extension (it's used for shared Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -400,6 +400,10 @@ recvmmsg; }; +FBSD_1.5 { + fdatasync; +}; + FBSDprivate_1.0 { ___acl_aclcheck_fd; __sys___acl_aclcheck_fd; @@ -594,6 +598,8 @@ __sys_fstatfs; _fsync; __sys_fsync; + _fdatasync; + __sys_fdatasync; _futimes; __sys_futimes; _getaudit; Index: sys/compat/freebsd32/syscalls.master =================================================================== --- sys/compat/freebsd32/syscalls.master +++ sys/compat/freebsd32/syscalls.master @@ -1081,3 +1081,4 @@ 549 AUE_NULL NOPROTO { int numa_setaffinity(cpuwhich_t which, \ id_t id, \ const struct vm_domain_policy *policy); } +550 AUE_FSYNC NOPROTO { int fdatasync(int fd); } Index: sys/fs/msdosfs/msdosfs_vnops.c =================================================================== --- sys/fs/msdosfs/msdosfs_vnops.c +++ sys/fs/msdosfs/msdosfs_vnops.c @@ -1897,6 +1897,7 @@ .vop_close = msdosfs_close, .vop_create = msdosfs_create, .vop_fsync = msdosfs_fsync, + .vop_fdatasync = vop_stdfdatasync_buf, .vop_getattr = msdosfs_getattr, .vop_inactive = msdosfs_inactive, .vop_link = msdosfs_link, Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -993,8 +993,9 @@ id_t id, \ struct vm_domain_policy_entry *policy); } 549 AUE_NULL STD { int numa_setaffinity(cpuwhich_t which, \ - id_t id, \ - const struct vm_domain_policy_entry *policy); } + id_t id, const struct \ + vm_domain_policy_entry *policy); } +550 AUE_FSYNC STD { int 
fdatasync(int fd); } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -83,6 +83,7 @@ static int vop_stdunset_text(struct vop_unset_text_args *ap); static int vop_stdget_writecount(struct vop_get_writecount_args *ap); static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); +static int vop_stdfdatasync(struct vop_fdatasync_args *ap); static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); /* @@ -111,6 +112,7 @@ .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, + .vop_fdatasync = vop_stdfdatasync, .vop_getpages = vop_stdgetpages, .vop_getpages_async = vop_stdgetpages_async, .vop_getwritemount = vop_stdgetwritemount, @@ -640,7 +642,6 @@ vop_stdfsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; - struct ucred *a_cred; int a_waitfor; struct thread *a_td; } */ *ap; @@ -727,6 +728,24 @@ return (error); } +static int +vop_stdfdatasync(struct vop_fdatasync_args *ap) +{ + + return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td)); +} + +int +vop_stdfdatasync_buf(struct vop_fdatasync_args *ap) +{ + struct vop_fsync_args apf; + + apf.a_vp = ap->a_vp; + apf.a_waitfor = MNT_WAIT; + apf.a_td = ap->a_td; + return (vop_stdfsync(&apf)); +} + /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ int vop_stdgetpages(ap) Index: sys/kern/vfs_syscalls.c =================================================================== --- sys/kern/vfs_syscalls.c +++ sys/kern/vfs_syscalls.c @@ -3354,20 +3354,8 @@ } #endif -/* - * Sync an open file. 
- */ -#ifndef _SYS_SYSPROTO_H_ -struct fsync_args { - int fd; -}; -#endif -int -sys_fsync(td, uap) - struct thread *td; - struct fsync_args /* { - int fd; - } */ *uap; +static int +kern_fsync(struct thread *td, int fd, bool datasync) { struct vnode *vp; struct mount *mp; @@ -3375,11 +3363,15 @@ cap_rights_t rights; int error, lock_flags; - AUDIT_ARG_FD(uap->fd); - error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FSYNC), &fp); + AUDIT_ARG_FD(fd); + error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp); if (error != 0) return (error); vp = fp->f_vnode; +#if 0 + if (datasync) + /* XXXKIB: complete outstanding aio writes */; +#endif error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; @@ -3396,8 +3388,7 @@ vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } - error = VOP_FSYNC(vp, MNT_WAIT, td); - + error = datasync ? VOP_FDATASYNC(vp, td) : VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); drop: @@ -3406,6 +3397,28 @@ } /* + * Sync an open file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +int +sys_fsync(struct thread *td, struct fsync_args *uap) +{ + + return (kern_fsync(td, uap->fd, false)); +} + +int +sys_fdatasync(struct thread *td, struct fdatasync_args *uap) +{ + + return (kern_fsync(td, uap->fd, true)); +} + +/* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. */ Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -704,6 +704,14 @@ IN int inc; }; +%% fdatasync vp L L L + +vop_fdatasync { + IN struct vnode *vp; + IN struct thread *td; +}; + + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares.
When merging a new VOP to a stable branch, Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -720,6 +720,7 @@ int vfs_write_suspend_umnt(struct mount *mp); void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); +int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); Index: sys/ufs/ffs/ffs_extern.h =================================================================== --- sys/ufs/ffs/ffs_extern.h +++ sys/ufs/ffs/ffs_extern.h @@ -174,6 +174,11 @@ * deadlock when flushing snapshot inodes while holding snaplk. */ #define NO_INO_UPDT 0x00000001 +/* + * Request a data-only sync from ffs_syncvnode(), touching even less + * metadata than NO_INO_UPDT does. + */ +#define DATA_ONLY 0x00000002 int ffs_rdonly(struct inode *); Index: sys/ufs/ffs/ffs_vnops.c =================================================================== --- sys/ufs/ffs/ffs_vnops.c +++ sys/ufs/ffs/ffs_vnops.c @@ -103,6 +103,7 @@ extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif static vop_fsync_t ffs_fsync; +static vop_fdatasync_t ffs_fdatasync; static vop_lock1_t ffs_lock; static vop_read_t ffs_read; static vop_write_t ffs_write; @@ -123,6 +124,7 @@ struct vop_vector ffs_vnodeops1 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, .vop_getpages = vnode_pager_local_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, @@ -135,6 +137,7 @@ struct vop_vector ffs_fifoops1 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, .vop_reallocblks = ffs_reallocblks, /* XXX: really ???
*/ .vop_vptofh = ffs_vptofh, }; @@ -143,6 +146,7 @@ struct vop_vector ffs_vnodeops2 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, .vop_getpages = vnode_pager_local_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_lock1 = ffs_lock, @@ -161,6 +165,7 @@ struct vop_vector ffs_fifoops2 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, .vop_lock1 = ffs_lock, .vop_reallocblks = ffs_reallocblks, .vop_strategy = ffsext_strategy, @@ -216,10 +221,10 @@ { struct inode *ip; struct bufobj *bo; - struct buf *bp; - struct buf *nbp; + struct buf *bp, *nbp; ufs_lbn_t lbn; - int error, wait, passes; + int error, passes; + bool still_dirty, wait; ip = VTOI(vp); ip->i_flag &= ~IN_NEEDSYNC; @@ -238,7 +243,7 @@ */ error = 0; passes = 0; - wait = 0; /* Always do an async pass first. */ + wait = false; /* Always do an async pass first. */ lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); BO_LOCK(bo); loop: @@ -254,15 +259,23 @@ if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; - /* Flush indirects in order. */ + /* + * Flush indirects in order, if requested. + * + * Note that if only datasync is requested, we can + * skip indirect blocks when softupdates are not + * active. Otherwise we must flush them with data, + * since dependencies prevent data block writes. + */ if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR && - lbn_level(bp->b_lblkno) >= passes) + (lbn_level(bp->b_lblkno) >= passes || + ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) continue; if (bp->b_lblkno > lbn) panic("ffs_syncvnode: syncing truncated data."); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { BO_UNLOCK(bo); - } else if (wait != 0) { + } else if (wait) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) != 0) { @@ -330,31 +343,59 @@ * these will be done with one sync and one async pass. 
*/ if (bo->bo_dirty.bv_cnt > 0) { - /* Write the inode after sync passes to flush deps. */ - if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) { - BO_UNLOCK(bo); - ffs_update(vp, 1); - BO_LOCK(bo); - } - /* switch between sync/async. */ - wait = !wait; - if (wait == 1 || ++passes < NIADDR + 2) - goto loop; + if ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)) { + /* + * For data-only sync, when not doing + * soft-updates, dirty indirect buffers are + * ignored. + */ + still_dirty = false; + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { + if (bp->b_lblkno > -NDADDR) { + still_dirty = true; + break; + } + } + } else + still_dirty = true; + + if (still_dirty) { + /* Write the inode after sync passes to flush deps. */ + if (wait && DOINGSOFTDEP(vp) && + (flags & NO_INO_UPDT) == 0) { + BO_UNLOCK(bo); + ffs_update(vp, 1); + BO_LOCK(bo); + } + /* switch between sync/async. */ + wait = !wait; + if (wait || ++passes < NIADDR + 2) + goto loop; #ifdef INVARIANTS - if (!vn_isdisk(vp, NULL)) - vn_printf(vp, "ffs_fsync: dirty "); + if (!vn_isdisk(vp, NULL)) + vn_printf(vp, "ffs_fsync: dirty "); #endif + } } BO_UNLOCK(bo); error = 0; - if ((flags & NO_INO_UPDT) == 0) - error = ffs_update(vp, 1); - if (DOINGSUJ(vp)) - softdep_journal_fsync(VTOI(vp)); + if ((flags & DATA_ONLY) == 0) { + if ((flags & NO_INO_UPDT) == 0) + error = ffs_update(vp, 1); + if (DOINGSUJ(vp)) + softdep_journal_fsync(VTOI(vp)); + } return (error); } static int +ffs_fdatasync(struct vop_fdatasync_args *ap) +{ + + return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); +} + +static int ffs_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp;