Index: sys/amd64/include/vmparam.h =================================================================== --- sys/amd64/include/vmparam.h +++ sys/amd64/include/vmparam.h @@ -223,5 +223,6 @@ #endif #define ZERO_REGION_SIZE (2 * 1024 * 1024) /* 2MB */ +#define BITBUCKET_REGION_SIZE (2 * 1024 * 1024) /* 2MB */ #endif /* _MACHINE_VMPARAM_H_ */ Index: sys/arm/include/vmparam.h =================================================================== --- sys/arm/include/vmparam.h +++ sys/arm/include/vmparam.h @@ -164,6 +164,7 @@ extern vm_offset_t vm_max_kernel_address; #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ +#define BITBUCKET_REGION_SIZE (64 * 1024) /* 64KB */ #ifndef VM_MAX_AUTOTUNE_MAXUSERS #define VM_MAX_AUTOTUNE_MAXUSERS 384 Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -203,6 +203,7 @@ ZIO_FLAG_NOPWRITE = 1 << 26, ZIO_FLAG_REEXECUTED = 1 << 27, ZIO_FLAG_DELEGATED = 1 << 28, + ZIO_FLAG_AGGREGATED = 1 << 29 }; #define ZIO_FLAG_MUSTSUCCEED 0 @@ -447,6 +448,7 @@ void *io_orig_data; uint64_t io_size; uint64_t io_orig_size; + size_t io_vcount; /* Stuff for the vdev stack */ vdev_t *io_vd; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -31,6 +31,8 @@ #include #include +#include + /* * Virtual device vector for files. 
*/ @@ -163,6 +165,7 @@ vdev_file_t *vf; vnode_t *vp; ssize_t resid; + enum uio_rw rw_op; if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); @@ -190,9 +193,27 @@ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); zio->io_target_timestamp = zio_handle_io_delay(zio); - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size, - zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + rw_op = zio->io_type == ZIO_TYPE_READ ? UIO_READ : UIO_WRITE; + + if ((zio->io_flags & ZIO_FLAG_AGGREGATED) != 0) { + bus_dma_segment_t *vlist = (bus_dma_segment_t *)zio->io_data; + bus_dma_segment_t *vlist_end = vlist + zio->io_vcount; + off_t offset = zio->io_offset; + + for (; vlist < vlist_end; offset += vlist->ds_len, vlist++) { + + zio->io_error = vn_rdwr(rw_op, vp, + (void *)vlist->ds_addr, vlist->ds_len, offset, + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + /* A short transfer invalidates later offsets. */ + if (zio->io_error != 0 || resid != 0) + break; + } + } else { + zio->io_error = vn_rdwr(rw_op, vp, zio->io_data, zio->io_size, + zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, + &resid); + } if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -26,18 +26,23 @@ */ #include + #include #include #include #include #include #include +#include #include #include #include + #include #include +#include + /* * Virtual device vector for GEOM. */ @@ -1005,7 +1010,17 @@ case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); - bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? 
BIO_READ : BIO_WRITE; + if (zio->io_type == ZIO_TYPE_READ) + bp->bio_cmd = BIO_READ; + else + bp->bio_cmd = BIO_WRITE; + + if ((zio->io_flags & ZIO_FLAG_AGGREGATED) != 0) { + /* Tell bio we're passing in a busdma segment list. */ + bp->bio_flags |= BIO_VLIST; + bp->bio_ma_n = zio->io_vcount; + } + bp->bio_data = zio->io_data; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c =================================================================== --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -34,6 +34,10 @@ #include #include #include +#include + +#include +#include /* * ZFS I/O Scheduler @@ -171,7 +175,7 @@ * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ -int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; +int zfs_vdev_aggregation_limit = MAXPHYS; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; @@ -223,7 +227,7 @@ #undef ZFS_VDEV_QUEUE_KNOB -SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN, +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RDTUN, &zfs_vdev_aggregation_limit, 0, "I/O requests are aggregated up to this size"); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN, @@ -476,15 +480,7 @@ static void vdev_queue_agg_io_done(zio_t *aio) { - if (aio->io_type == ZIO_TYPE_READ) { - zio_t *pio; - while ((pio = zio_walk_parents(aio)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); - } - } - - zio_buf_free(aio->io_data, aio->io_size); + zio_buf_free(aio->io_data, (uintptr_t)aio->io_private); } static int @@ -621,11 +617,16 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; + bus_dma_segment_t *vlist; + uintptr_t vlist_size; 
uint64_t maxgap = 0; + uint64_t max_opt_io = 0; uint64_t size; boolean_t stretch; avl_tree_t *t; enum zio_flag flags; + size_t nios; + size_t noptios; ASSERT(MUTEX_HELD(&vq->vq_lock)); @@ -633,9 +634,15 @@ return (NULL); first = last = zio; + nios = 1; + noptios = 0; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + max_opt_io = BITBUCKET_REGION_SIZE; + } else { + max_opt_io = ZERO_REGION_SIZE; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -663,14 +670,20 @@ IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && IO_GAP(dio, first) <= maxgap) { first = dio; - if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) + if ((first->io_flags & ZIO_FLAG_OPTIONAL) != 0) { + noptios += first->io_size / max_opt_io; + } else if (mandatory == NULL) { mandatory = first; + } + nios++; } /* * Skip any initial optional I/Os. */ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { + noptios -= first->io_size / max_opt_io; + nios--; first = AVL_NEXT(t, first); ASSERT(first != NULL); } @@ -683,8 +696,12 @@ IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit && IO_GAP(last, dio) <= maxgap) { last = dio; - if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) + if ((last->io_flags & ZIO_FLAG_OPTIONAL) != 0) { + noptios += last->io_size / max_opt_io; + } else { mandatory = last; + } + nios++; } /* @@ -704,7 +721,7 @@ IO_GAP(nio, dio) == 0 && IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { nio = dio; - if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + if ((nio->io_flags & ZIO_FLAG_OPTIONAL) == 0) { stretch = B_TRUE; break; } @@ -718,6 +735,8 @@ } else { while (last != mandatory && last != first) { ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); + noptios -= last->io_size / max_opt_io; + nios--; last = AVL_PREV(t, last); ASSERT(last != NULL); } @@ -729,33 +748,64 @@ size = IO_SPAN(first, last); ASSERT3U(size, <=, zfs_vdev_aggregation_limit); + /* + * Construct a vector with space for nios and any 
gaps between them. + * Spans wider than the zero/bitbucket region need extra segments. + */ + vlist_size = (((2 * nios) - 1) + noptios + + (nios - 1) * (maxgap / max_opt_io)) * sizeof(*vlist); + vlist = zio_buf_alloc(vlist_size); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - zio_buf_alloc(size), size, first->io_type, zio->io_priority, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); + vlist, size, first->io_type, zio->io_priority, flags | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_AGGREGATED, + vdev_queue_agg_io_done, (void *)vlist_size); aio->io_timestamp = first->io_timestamp; nio = first; - do { + for (;;) { dio = nio; nio = AVL_NEXT(t, dio); ASSERT3U(dio->io_type, ==, aio->io_type); + vlist->ds_len = dio->io_size; + VERIFY3U(vlist->ds_len, >, 0); if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); - } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); + vlist->ds_addr = (bus_addr_t)zero_region; + while (vlist->ds_len > max_opt_io) { + vlist[1].ds_addr = (bus_addr_t)zero_region; + vlist[1].ds_len = vlist->ds_len - max_opt_io; + vlist->ds_len = max_opt_io; + vlist++; + } + } else { + vlist->ds_addr = (bus_addr_t)dio->io_data; } + vlist++; zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); zio_execute(dio); - } while (dio != last); + + if (dio == last) + break; + + if (IO_GAP(dio, nio)) { + ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); + vlist->ds_len = IO_GAP(dio, nio); + vlist->ds_addr = (bus_addr_t)bitbucket_region; + while (vlist->ds_len > max_opt_io) { + vlist[1].ds_addr = (bus_addr_t)bitbucket_region; + vlist[1].ds_len = vlist->ds_len - max_opt_io; + vlist->ds_len = max_opt_io; + vlist++; + } + VERIFY3U(vlist->ds_len, >, 0); + vlist++; + } + } + 
aio->io_vcount = vlist - (bus_dma_segment_t *)aio->io_data; return (aio); } Index: sys/i386/include/vmparam.h =================================================================== --- sys/i386/include/vmparam.h +++ sys/i386/include/vmparam.h @@ -190,6 +190,7 @@ #endif #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ +#define BITBUCKET_REGION_SIZE (64 * 1024) /* 64KB */ #ifndef VM_MAX_AUTOTUNE_MAXUSERS #define VM_MAX_AUTOTUNE_MAXUSERS 384 Index: sys/mips/include/vmparam.h =================================================================== --- sys/mips/include/vmparam.h +++ sys/mips/include/vmparam.h @@ -183,6 +183,7 @@ #define VM_NFREEORDER 9 #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ +#define BITBUCKET_REGION_SIZE (64 * 1024) /* 64KB */ #ifndef __mips_n64 #define SFBUF Index: sys/powerpc/include/vmparam.h =================================================================== --- sys/powerpc/include/vmparam.h +++ sys/powerpc/include/vmparam.h @@ -195,6 +195,7 @@ #endif #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ +#define BITBUCKET_REGION_SIZE (64 * 1024) /* 64KB */ /* * On 32-bit OEA, the only purpose for which sf_buf is used is to implement Index: sys/sparc64/include/vmparam.h =================================================================== --- sys/sparc64/include/vmparam.h +++ sys/sparc64/include/vmparam.h @@ -237,6 +237,7 @@ * caching disabled. 
*/ #define ZERO_REGION_SIZE PAGE_SIZE +#define BITBUCKET_REGION_SIZE PAGE_SIZE #define SFBUF #define SFBUF_MAP Index: sys/sys/systm.h =================================================================== --- sys/sys/systm.h +++ sys/sys/systm.h @@ -146,6 +146,7 @@ extern char **kenvp; extern const void *zero_region; /* address space maps to a zeroed page */ +extern void *bitbucket_region; /* address space maps to writable page */ extern int unmapped_buf_allowed; Index: sys/vm/vm_extern.h =================================================================== --- sys/vm/vm_extern.h +++ sys/vm/vm_extern.h @@ -69,6 +69,7 @@ boolean_t); void kmem_init(vm_offset_t, vm_offset_t); void kmem_init_zero_region(void); +void kmem_init_bitbucket_region(void); void kmeminit(void); void swapout_procs(int); Index: sys/vm/vm_init.c =================================================================== --- sys/vm/vm_init.c +++ sys/vm/vm_init.c @@ -161,6 +161,7 @@ #endif kmem_init_zero_region(); + kmem_init_bitbucket_region(); pmap_init(); vm_pager_init(); } Index: sys/vm/vm_kern.c =================================================================== --- sys/vm/vm_kern.c +++ sys/vm/vm_kern.c @@ -97,6 +97,9 @@ /* NB: Used by kernel debuggers. */ const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS; +void *bitbucket_region; +CTASSERT((BITBUCKET_REGION_SIZE & PAGE_MASK) == 0); + SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD, SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address"); @@ -505,6 +508,28 @@ zero_region = (const void *)addr; } +void +kmem_init_bitbucket_region(void) +{ + vm_offset_t addr, i; + vm_page_t m; + + /* + * Map a single physical page to a larger virtual range. + * This requires less looping in places where large amounts of + * DMA data must be discarded, while not using much more physical + * resources. 
+ */ + addr = kva_alloc(BITBUCKET_REGION_SIZE); + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); + for (i = 0; i < BITBUCKET_REGION_SIZE; i += PAGE_SIZE) + pmap_qenter(addr + i, &m, 1); + pmap_protect(kernel_pmap, addr, addr + BITBUCKET_REGION_SIZE, + VM_PROT_WRITE); + bitbucket_region = (void *)addr; +} + /* * kmem_init: *