diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile --- a/cddl/lib/libzpool/Makefile +++ b/cddl/lib/libzpool/Makefile @@ -135,6 +135,7 @@ uberblock.c \ unique.c \ vdev.c \ + vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_file.c \ diff --git a/sys/cddl/compat/opensolaris/sys/atomic.h b/sys/cddl/compat/opensolaris/sys/atomic.h --- a/sys/cddl/compat/opensolaris/sys/atomic.h +++ b/sys/cddl/compat/opensolaris/sys/atomic.h @@ -157,7 +157,7 @@ return (atomic_add_64_nv(target, -1)); } -#ifdef __LP64__ +#if !defined(COMPAT_32BIT) && defined(__LP64__) static __inline void * atomic_cas_ptr(volatile void *target, void *cmp, void *newval) { @@ -171,6 +171,6 @@ return ((void *)atomic_cas_32((volatile uint32_t *)target, (uint32_t)cmp, (uint32_t)newval)); } -#endif /* __LP64__ */ +#endif /* !defined(COMPAT_32BIT) && defined(__LP64__) */ #endif /* !_OPENSOLARIS_SYS_ATOMIC_H_ */ diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -326,6 +326,7 @@ contrib/openzfs/module/zfs/uberblock.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/unique.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev.c optional zfs compile-with "${ZFS_C}" +contrib/openzfs/module/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev_draid.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev_draid_rand.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}" diff --git a/sys/conf/kern.pre.mk b/sys/conf/kern.pre.mk --- a/sys/conf/kern.pre.mk +++ b/sys/conf/kern.pre.mk @@ -252,7 +252,8 @@ # Special flags for managing the compat compiles for ZFS ZFS_CFLAGS+= -I$S/contrib/openzfs/module/icp/include \ ${CDDL_CFLAGS} -DBUILDING_ZFS -DHAVE_UIO_ZEROCOPY \ - -DWITH_NETDUMP -D__KERNEL__ -D_SYS_CONDVAR_H_ -DSMP + -DWITH_NETDUMP -D__KERNEL__ -D_SYS_CONDVAR_H_ -DSMP \ + -DIN_FREEBSD_BASE .if ${MACHINE_ARCH} == "amd64" ZFS_CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \ diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.0 -Release: rc1 +Version: 2.1.99 +Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.3 +Linux-Maximum: 6.2 Linux-Minimum: 3.10 diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/arc_summary --- a/sys/contrib/openzfs/cmd/arc_summary +++ b/sys/contrib/openzfs/cmd/arc_summary @@ -64,6 +64,7 @@ SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats + 'vdev': 'vdev_cache_stats', 'zfetch': 'zfetchstats', 'zil': 'zil'} @@ -89,6 +90,8 @@ # Requires py36-sysctl on FreeBSD import sysctl + VDEV_CACHE_SIZE = 'vdev.cache_size' + def is_value(ctl): return ctl.type != sysctl.CTLTYPE_NODE @@ -132,6 +135,8 @@ SPL_PATH = '/sys/module/spl/parameters' TUNABLES_PATH = '/sys/module/zfs/parameters' + VDEV_CACHE_SIZE = 'zfs_vdev_cache_size' + def load_kstats(section): path = os.path.join(KSTAT_PATH, section) with open(path) as f: @@ -837,8 +842,7 @@ ('Free on write:', 'l2_free_on_write'), ('R/W clashes:', 'l2_rw_clash'), ('Bad checksums:', 'l2_cksum_bad'), - ('Read errors:', 'l2_io_error'), - ('Write errors:', 'l2_writes_error')) + ('I/O errors:', 'l2_io_error')) for title, value in l2_todo: prt_i1(title, f_hits(arc_stats[value])) @@ -874,20 +878,28 @@ prt_i2('Miss ratio:', 
f_perc(arc_stats['l2_misses'], l2_access_total), f_hits(arc_stats['l2_misses'])) + prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) print() - print('L2ARC I/O:') - prt_i2('Reads:', - f_bytes(arc_stats['l2_read_bytes']), - f_hits(arc_stats['l2_hits'])) - prt_i2('Writes:', - f_bytes(arc_stats['l2_write_bytes']), - f_hits(arc_stats['l2_writes_sent'])) + print('L2ARC writes:') + + if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']: + prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent'])) + prt_i2('Done ratio:', + f_perc(arc_stats['l2_writes_done'], + arc_stats['l2_writes_sent']), + f_hits(arc_stats['l2_writes_done'])) + prt_i2('Error ratio:', + f_perc(arc_stats['l2_writes_error'], + arc_stats['l2_writes_sent']), + f_hits(arc_stats['l2_writes_error'])) + else: + prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) print() print('L2ARC evicts:') - prt_i1('L1 cached:', f_hits(arc_stats['l2_evict_l1cached'])) - prt_i1('While reading:', f_hits(arc_stats['l2_evict_reading'])) + prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry'])) + prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading'])) print() @@ -947,6 +959,35 @@ print() +def section_vdev(kstats_dict): + """Collect information on VDEV caches""" + + # Currently [Nov 2017] the VDEV cache is disabled, because it is actually + # harmful. When this is the case, we just skip the whole entry. See + # https://github.com/openzfs/zfs/blob/master/module/zfs/vdev_cache.c + # for details + tunables = get_vdev_params() + + if tunables[VDEV_CACHE_SIZE] == '0': + print('VDEV cache disabled, skipping section\n') + return + + vdev_stats = isolate_section('vdev_cache_stats', kstats_dict) + + vdev_cache_total = int(vdev_stats['hits']) +\ + int(vdev_stats['misses']) +\ + int(vdev_stats['delegations']) + + prt_1('VDEV cache summary:', f_hits(vdev_cache_total)) + prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total), + f_hits(vdev_stats['hits'])) + prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total), + f_hits(vdev_stats['misses'])) + prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total), + f_hits(vdev_stats['delegations'])) + print() + + def section_zil(kstats_dict): """Collect information on the ZFS Intent Log. Some of the information taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h @@ -974,6 +1015,7 @@ 'l2arc': section_l2arc, 'spl': section_spl, 'tunables': section_tunables, + 'vdev': section_vdev, 'zil': section_zil} diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -33,7 +33,6 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome - * Copyright (c) 2023, Klara Inc. 
*/ #include @@ -327,7 +326,7 @@ int err; struct sublivelist_verify *sv = args; - zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL, + zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, @@ -391,7 +390,7 @@ { (void) args; sublivelist_verify_t sv; - zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, sizeof (sublivelist_verify_block_t)); int err = sublivelist_verify_func(&sv, dle); zfs_btree_clear(&sv.sv_leftover); @@ -683,7 +682,7 @@ (void) printf("Verifying deleted livelist entries\n"); sublivelist_verify_t sv; - zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, sizeof (sublivelist_verify_block_t)); iterate_deleted_livelists(spa, livelist_verify, &sv); @@ -717,7 +716,7 @@ mv.mv_start = m->ms_start; mv.mv_end = m->ms_start + m->ms_size; zfs_btree_create(&mv.mv_livelist_allocs, - livelist_block_compare, NULL, + livelist_block_compare, sizeof (sublivelist_verify_block_t)); mv_populate_livelist_allocs(&mv, &sv); @@ -790,11 +789,8 @@ "\t\t[[/] [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] [-K ]\n" "\t\t[[/] [ ...]\n" - "\t%s -B [-e [-V] [-p ...]] [-I ]\n" - "\t\t[-o =]... [-t ] [-U ] [-x ]\n" - "\t\t[-K ] / []\n" "\t%s [-v] \n" - "\t%s -C [-A] [-U ] []\n" + "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" @@ -806,7 +802,7 @@ "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, - cmdname, cmdname, cmdname, cmdname, cmdname); + cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); @@ -829,8 +825,6 @@ (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b --block-stats " "block statistics\n"); - (void) fprintf(stderr, " -B --backup " - "backup stream\n"); (void) fprintf(stderr, " -c --checksum " "checksum all metadata (twice for all data) blocks\n"); (void) fprintf(stderr, " -C --config " @@ -4881,81 +4875,6 @@ return (err); } -static int -dump_backup_bytes(objset_t *os, void *buf, int len, void *arg) -{ - const char *p = (const char *)buf; - ssize_t nwritten; - - (void) os; - (void) arg; - - /* Write the data out, handling short writes and signals. 
*/ - while ((nwritten = write(STDOUT_FILENO, p, len)) < len) { - if (nwritten < 0) { - if (errno == EINTR) - continue; - return (errno); - } - p += nwritten; - len -= nwritten; - } - - return (0); -} - -static void -dump_backup(const char *pool, uint64_t objset_id, const char *flagstr) -{ - boolean_t embed = B_FALSE; - boolean_t large_block = B_FALSE; - boolean_t compress = B_FALSE; - boolean_t raw = B_FALSE; - - const char *c; - for (c = flagstr; c != NULL && *c != '\0'; c++) { - switch (*c) { - case 'e': - embed = B_TRUE; - break; - case 'L': - large_block = B_TRUE; - break; - case 'c': - compress = B_TRUE; - break; - case 'w': - raw = B_TRUE; - break; - default: - fprintf(stderr, "dump_backup: invalid flag " - "'%c'\n", *c); - return; - } - } - - if (isatty(STDOUT_FILENO)) { - fprintf(stderr, "dump_backup: stream cannot be written " - "to a terminal\n"); - return; - } - - offset_t off = 0; - dmu_send_outparams_t out = { - .dso_outfunc = dump_backup_bytes, - .dso_dryrun = B_FALSE, - }; - - int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed, - large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO, - &off, &out); - if (err != 0) { - fprintf(stderr, "dump_backup: dmu_send_obj: %s\n", - strerror(err)); - return; - } -} - static int zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) { @@ -8546,9 +8465,9 @@ */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | - ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, - NULL, NULL)); + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); @@ -8642,6 +8561,7 @@ zio_nowait(zio_vdev_child_io(czio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | @@ -8775,7 +8695,6 @@ struct option long_options[] = { {"ignore-assertions", no_argument, NULL, 'A'}, {"block-stats", no_argument, NULL, 'b'}, - {"backup", no_argument, NULL, 'B'}, {"checksum", no_argument, NULL, 'c'}, {"config", no_argument, NULL, 'C'}, {"datasets", no_argument, NULL, 'd'}, @@ -8817,11 +8736,10 @@ }; while ((c = getopt_long(argc, argv, - "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", + "AbcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", long_options, NULL)) != -1) { switch (c) { case 'b': - case 'B': case 'c': case 'C': case 'd': @@ -8969,7 +8887,7 @@ verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL) + if (dump_all && strchr("AeEFkKlLNOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; @@ -9155,8 +9073,7 @@ checkpoint_pool, error); } - } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] || - objset_id == 0) { + } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); @@ -9292,10 +9209,7 @@ strerror(errno)); } } - if (dump_opt['B']) { - dump_backup(target, objset_id, - argc > 0 ? 
argv[0] : NULL); - } else if (os != NULL) { + if (os != NULL) { dump_objset(os); } else if (zopt_object_args > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c @@ -369,7 +369,9 @@ return (NULL); } - if ((event = list_remove_head(&agent_events)) != NULL) { + if ((event = (list_head(&agent_events))) != NULL) { + list_remove(&agent_events, event); + (void) pthread_mutex_unlock(&agent_lock); /* dispatch to all event subscribers */ @@ -432,7 +434,8 @@ (void) pthread_join(g_agents_tid, NULL); /* drain any pending events */ - while ((event = list_remove_head(&agent_events)) != NULL) { + while ((event = (list_head(&agent_events))) != NULL) { + list_remove(&agent_events, event); nvlist_free(event->ae_nvl); free(event); } diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c @@ -1288,14 +1288,17 @@ tpool_destroy(g_tpool); } - while ((pool = list_remove_head(&g_pool_list)) != NULL) { + while ((pool = (list_head(&g_pool_list))) != NULL) { + list_remove(&g_pool_list, pool); zpool_close(pool->uap_zhp); free(pool); } list_destroy(&g_pool_list); - while ((device = list_remove_head(&g_device_list)) != NULL) + while ((device = (list_head(&g_device_list))) != NULL) { + list_remove(&g_device_list, device); free(device); + } list_destroy(&g_device_list); libzfs_fini(g_zfshdl); diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c --- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c +++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c @@ -6057,8 +6057,8 @@ if (p != NULL) rid = p->pw_uid; else if (*endch != '\0') { - (void) snprintf(errbuf, sizeof (errbuf), - gettext("invalid user %s\n"), curr); + (void) snprintf(errbuf, 256, gettext( + "invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { @@ -6071,9 +6071,8 @@ if (g != NULL) rid = g->gr_gid; else if (*endch != '\0') { - (void) snprintf(errbuf, sizeof (errbuf), - gettext("invalid group %s\n"), - curr); + (void) snprintf(errbuf, 256, gettext( + "invalid group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else { @@ -6098,9 +6097,8 @@ who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { - (void) snprintf(errbuf, sizeof (errbuf), - gettext("invalid user/group %s\n"), - curr); + (void) snprintf(errbuf, 256, gettext( + "invalid user/group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } diff --git a/sys/contrib/openzfs/cmd/zilstat.in b/sys/contrib/openzfs/cmd/zilstat.in --- a/sys/contrib/openzfs/cmd/zilstat.in +++ b/sys/contrib/openzfs/cmd/zilstat.in @@ -36,49 +36,31 @@ from argparse import RawTextHelpFormatter cols = { - # hdr: [size, scale, kstat name] + # hdr: [size, scale, kstat name] "time": [8, -1, "time"], "pool": [12, -1, "pool"], "ds": [12, -1, "dataset_name"], "obj": [12, -1, "objset"], - "cc": [5, 1000, "zil_commit_count"], - "cwc": [5, 1000, "zil_commit_writer_count"], - "ic": [5, 1000, "zil_itx_count"], - "iic": [5, 1000, "zil_itx_indirect_count"], - "iib": [5, 1024, "zil_itx_indirect_bytes"], - "icc": [5, 1000, "zil_itx_copied_count"], - "icb": [5, 1024, "zil_itx_copied_bytes"], - "inc": [5, 1000, "zil_itx_needcopy_count"], - "inb": [5, 1024, "zil_itx_needcopy_bytes"], - "idc": [5, 1000, "icc+inc"], - "idb": [5, 
1024, "icb+inb"], - "iwc": [5, 1000, "iic+idc"], - "iwb": [5, 1024, "iib+idb"], - "imnc": [6, 1000, "zil_itx_metaslab_normal_count"], - "imnb": [6, 1024, "zil_itx_metaslab_normal_bytes"], - "imnw": [6, 1024, "zil_itx_metaslab_normal_write"], - "imna": [6, 1024, "zil_itx_metaslab_normal_alloc"], - "imsc": [6, 1000, "zil_itx_metaslab_slog_count"], - "imsb": [6, 1024, "zil_itx_metaslab_slog_bytes"], - "imsw": [6, 1024, "zil_itx_metaslab_slog_write"], - "imsa": [6, 1024, "zil_itx_metaslab_slog_alloc"], - "imc": [5, 1000, "imnc+imsc"], - "imb": [5, 1024, "imnb+imsb"], - "imw": [5, 1024, "imnw+imsw"], - "ima": [5, 1024, "imna+imsa"], - "se%": [3, 100, "imb/ima"], - "sen%": [4, 100, "imnb/imna"], - "ses%": [4, 100, "imsb/imsa"], - "te%": [3, 100, "imb/imw"], - "ten%": [4, 100, "imnb/imnw"], - "tes%": [4, 100, "imsb/imsw"], + "zcc": [10, 1000, "zil_commit_count"], + "zcwc": [10, 1000, "zil_commit_writer_count"], + "ziic": [10, 1000, "zil_itx_indirect_count"], + "zic": [10, 1000, "zil_itx_count"], + "ziib": [10, 1024, "zil_itx_indirect_bytes"], + "zicc": [10, 1000, "zil_itx_copied_count"], + "zicb": [10, 1024, "zil_itx_copied_bytes"], + "zinc": [10, 1000, "zil_itx_needcopy_count"], + "zinb": [10, 1024, "zil_itx_needcopy_bytes"], + "zimnc": [10, 1000, "zil_itx_metaslab_normal_count"], + "zimnb": [10, 1024, "zil_itx_metaslab_normal_bytes"], + "zimsc": [10, 1000, "zil_itx_metaslab_slog_count"], + "zimsb": [10, 1024, "zil_itx_metaslab_slog_bytes"], } -hdr = ["time", "ds", "cc", "ic", "idc", "idb", "iic", "iib", - "imnc", "imnw", "imsc", "imsw"] +hdr = ["time", "pool", "ds", "obj", "zcc", "zcwc", "ziic", "zic", "ziib", \ + "zicc", "zicb", "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] -ghdr = ["time", "cc", "ic", "idc", "idb", "iic", "iib", - "imnc", "imnw", "imsc", "imsw"] +ghdr = ["time", "zcc", "zcwc", "ziic", "zic", "ziib", "zicc", "zicb", + "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]") @@ -123,7 +105,7 @@ global sep for col in hdr: new_col = col - if interval > 0 and cols[col][1] > 100: + if interval > 0 and col not in ['time', 'pool', 'ds', 'obj']: new_col += "/s" sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep)) sys.stdout.write("\n") @@ -133,7 +115,7 @@ global sep for col in hdr: val = v[cols[col][2]] - if interval > 0 and cols[col][1] > 100: + if col not in ['time', 'pool', 'ds', 'obj'] and interval > 0: val = v[cols[col][2]] // interval sys.stdout.write("%s%s" % ( prettynum(cols[col][0], cols[col][1], val), sep)) @@ -255,7 +237,9 @@ invalid = [] for ele in hdr: - if ele not in cols: + if gFlag and ele not in ghdr: + invalid.append(ele) + elif ele not in cols: invalid.append(ele) if len(invalid) > 0: @@ -419,17 +403,17 @@ diff = copy.deepcopy(curr) for pool in curr: for objset in curr[pool]: - for key in curr[pool][objset]: - if not isinstance(diff[pool][objset][key], int): - continue - # If prev is NULL, this is the - # first time we are here - if not prev: - diff[pool][objset][key] = 0 - else: - diff[pool][objset][key] \ - = curr[pool][objset][key] \ - - prev[pool][objset][key] + for col in hdr: + if col not in ['time', 'pool', 'ds', 'obj']: + key = cols[col][2] + # If prev is NULL, this is the + # first time we are here + if not prev: + diff[pool][objset][key] = 0 + else: + diff[pool][objset][key] \ + = curr[pool][objset][key] \ + - prev[pool][objset][key] def zil_build_dict(pool = "GLOBAL"): global kstat @@ -441,77 +425,10 @@ if objset not in curr[pool]: curr[pool][objset] = dict() curr[pool][objset][key] = val 
- -def zil_extend_dict(): - global diff - for pool in diff: - for objset in diff[pool]: - diff[pool][objset]["pool"] = pool - diff[pool][objset]["objset"] = objset - diff[pool][objset]["time"] = time.strftime("%H:%M:%S", \ - time.localtime()) - diff[pool][objset]["icc+inc"] = \ - diff[pool][objset]["zil_itx_copied_count"] + \ - diff[pool][objset]["zil_itx_needcopy_count"] - diff[pool][objset]["icb+inb"] = \ - diff[pool][objset]["zil_itx_copied_bytes"] + \ - diff[pool][objset]["zil_itx_needcopy_bytes"] - diff[pool][objset]["iic+idc"] = \ - diff[pool][objset]["zil_itx_indirect_count"] + \ - diff[pool][objset]["zil_itx_copied_count"] + \ - diff[pool][objset]["zil_itx_needcopy_count"] - diff[pool][objset]["iib+idb"] = \ - diff[pool][objset]["zil_itx_indirect_bytes"] + \ - diff[pool][objset]["zil_itx_copied_bytes"] + \ - diff[pool][objset]["zil_itx_needcopy_bytes"] - diff[pool][objset]["imnc+imsc"] = \ - diff[pool][objset]["zil_itx_metaslab_normal_count"] + \ - diff[pool][objset]["zil_itx_metaslab_slog_count"] - diff[pool][objset]["imnb+imsb"] = \ - diff[pool][objset]["zil_itx_metaslab_normal_bytes"] + \ - diff[pool][objset]["zil_itx_metaslab_slog_bytes"] - diff[pool][objset]["imnw+imsw"] = \ - diff[pool][objset]["zil_itx_metaslab_normal_write"] + \ - diff[pool][objset]["zil_itx_metaslab_slog_write"] - diff[pool][objset]["imna+imsa"] = \ - diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + \ - diff[pool][objset]["zil_itx_metaslab_slog_alloc"] - if diff[pool][objset]["imna+imsa"] > 0: - diff[pool][objset]["imb/ima"] = 100 * \ - diff[pool][objset]["imnb+imsb"] // \ - diff[pool][objset]["imna+imsa"] - else: - diff[pool][objset]["imb/ima"] = 100 - if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0: - diff[pool][objset]["imnb/imna"] = 100 * \ - diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \ - diff[pool][objset]["zil_itx_metaslab_normal_alloc"] - else: - diff[pool][objset]["imnb/imna"] = 100 - if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0: - diff[pool][objset]["imsb/imsa"] = 100 * \ - diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \ - diff[pool][objset]["zil_itx_metaslab_slog_alloc"] - else: - diff[pool][objset]["imsb/imsa"] = 100 - if diff[pool][objset]["imnw+imsw"] > 0: - diff[pool][objset]["imb/imw"] = 100 * \ - diff[pool][objset]["imnb+imsb"] // \ - diff[pool][objset]["imnw+imsw"] - else: - diff[pool][objset]["imb/imw"] = 100 - if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0: - diff[pool][objset]["imnb/imnw"] = 100 * \ - diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \ - diff[pool][objset]["zil_itx_metaslab_normal_write"] - else: - diff[pool][objset]["imnb/imnw"] = 100 - if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0: - diff[pool][objset]["imsb/imsw"] = 100 * \ - diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \ - diff[pool][objset]["zil_itx_metaslab_slog_write"] - else: - diff[pool][objset]["imsb/imsw"] = 100 + curr[pool][objset]["pool"] = pool + curr[pool][objset]["objset"] = objset + curr[pool][objset]["time"] = time.strftime("%H:%M:%S", \ + time.localtime()) def sign_handler_epipe(sig, frame): print("Caught EPIPE signal: " + str(frame)) @@ -520,31 +437,30 @@ def main(): global interval - global curr, diff + global curr hprint = False init() signal.signal(signal.SIGINT, signal.SIG_DFL) signal.signal(signal.SIGPIPE, sign_handler_epipe) - zil_process_kstat() - if not curr: - print ("Error: No stats to show") - sys.exit(0) - print_header() if interval > 0: - time.sleep(interval) while True: calculate_diff() if not diff: print 
("Error: No stats to show") sys.exit(0) - zil_extend_dict() + if hprint == False: + print_header() + hprint = True print_dict(diff) time.sleep(interval) else: - diff = curr - zil_extend_dict() - print_dict(diff) + zil_process_kstat() + if not curr: + print ("Error: No stats to show") + sys.exit(0) + print_header() + print_dict(curr) if __name__ == '__main__': main() diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am --- a/sys/contrib/openzfs/cmd/zpool/Makefile.am +++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am @@ -145,7 +145,6 @@ %D%/compatibility.d/openzfs-2.0-linux \ %D%/compatibility.d/openzfs-2.1-freebsd \ %D%/compatibility.d/openzfs-2.1-linux \ - %D%/compatibility.d/openzfs-2.2 \ %D%/compatibility.d/openzfsonosx-1.7.0 \ %D%/compatibility.d/openzfsonosx-1.8.1 \ %D%/compatibility.d/openzfsonosx-1.9.3 \ @@ -169,20 +168,12 @@ "freebsd-11.3 freebsd-12.0" \ "freebsd-11.3 freebsd-12.1" \ "freebsd-11.3 freebsd-12.2" \ - "freebsd-11.3 freebsd-12.3" \ - "freebsd-11.3 freebsd-12.4" \ - "openzfs-2.1-freebsd freebsd-13.0" \ - "openzfs-2.1-freebsd freebsd-13.1" \ - "openzfs-2.1-freebsd freebsd-13.2" \ "freebsd-11.3 freenas-11.3" \ "freenas-11.0 freenas-11.1" \ "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \ "openzfs-2.0-freebsd truenas-12.0" \ "zol-0.7 ubuntu-18.04" \ - "zol-0.8 ubuntu-20.04" \ - "openzfs-2.1-linux ubuntu-22.04" \ - "openzfs-2.2 openzfs-2.2-linux" \ - "openzfs-2.2 openzfs-2.2-freebsd" + "zol-0.8 ubuntu-20.04" zpoolconfdir = $(sysconfdir)/zfs/zpool.d INSTALL_DATA_HOOKS += zpool-install-data-hook diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 --- a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 @@ -8,7 +8,5 @@ filesystem_limits hole_birth large_blocks -livelist lz4_compress spacemap_histogram -zpool_checkpoint diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2 deleted file mode 100644 --- a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2 +++ /dev/null @@ -1,40 +0,0 @@ -# Features supported by OpenZFS 2.2 on Linux and FreeBSD -allocation_classes -async_destroy -blake3 -block_cloning -bookmark_v2 -bookmark_written -bookmarks -device_rebuild -device_removal -draid -edonr -embedded_data -empty_bpobj -enabled_txg -encryption -extensible_dataset -filesystem_limits -head_errlog -hole_birth -large_blocks -large_dnode -livelist -log_spacemap -lz4_compress -multi_vdev_crash_dump -obsolete_counts -project_quota -redacted_datasets -redaction_bookmarks -resilver_defer -sha512 -skein -spacemap_histogram -spacemap_v2 -userobj_accounting -vdev_zaps_v2 -zilsaxattr -zpool_checkpoint -zstd_compress diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -7662,11 +7662,11 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; - uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i; + uint64_t pass_scanned, scanned, pass_issued, issued, total; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; - char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7]; - char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32]; + char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; + char srate_buf[7], irate_buf[7], 
time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); @@ -7738,11 +7738,10 @@ pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; - total_s = ps->pss_to_examine; - total_i = ps->pss_to_examine - ps->pss_skipped; + total = ps->pss_to_examine; /* we are only done with a block once we have issued the IO for it */ - fraction_done = (double)issued / total_i; + fraction_done = (double)issued / total; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; @@ -7751,25 +7750,26 @@ scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; + uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? + ((total - issued) / issue_rate) : UINT64_MAX; + secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); - zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf)); - zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf)); + zfs_nicebytes(total, total_buf, sizeof (total_buf)); + zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); + zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); /* do not print estimated time if we have a paused scrub */ - (void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf); - if (pause == 0 && scan_rate > 0) { - zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); - (void) printf(gettext(" at %s/s"), srate_buf); - } - (void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf); - if (pause == 0 && issue_rate > 0) { - zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); - (void) printf(gettext(" at %s/s"), irate_buf); + if (pause == 0) { + (void) printf(gettext("\t%s scanned at %s/s, " + "%s issued at %s/s, %s total\n"), + scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); + } else { + (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), + scanned_buf, issued_buf, total_buf); } - (void) printf(gettext("\n")); if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), @@ -7782,16 +7782,16 @@ if (pause == 0) { /* * Only provide an estimate iff: - * 1) we haven't yet issued all we expected, and + * 1) the time remaining is valid, and * 2) the issue rate exceeds 10 MB/s, and * 3) it's either: * a) a resilver which has started repairs, or * b) a scrub which has entered the issue phase. 
*/ - if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 && + if (total_secs_left != UINT64_MAX && + issue_rate >= 10 * 1024 * 1024 && ((is_resilver && ps->pss_processed > 0) || (is_scrub && issued > 0))) { - secs_to_dhms((total_i - issued) / issue_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " @@ -7803,7 +7803,7 @@ } static void -print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name) +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; @@ -7815,20 +7815,17 @@ uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; - uint64_t bytes_est_s = vrs->vrs_bytes_est; - uint64_t bytes_est_i = vrs->vrs_bytes_est; - if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8) - bytes_est_i -= vrs->vrs_pass_bytes_skipped; + uint64_t bytes_est = vrs->vrs_bytes_est; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / - (bytes_est_s + 1), 100); + (bytes_est + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; - char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); @@ -7836,8 +7833,9 @@ sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); - zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf)); - zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf)); + zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; @@ -7860,29 +7858,17 @@ assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); - (void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf, - bytes_est_s_buf); - if (scan_rate > 0) { - zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); - (void) printf(gettext(" at %s/s"), scan_rate_buf); - } - (void) printf(gettext(", %s / %s issued"), bytes_issued_buf, - bytes_est_i_buf); - if (issue_rate > 0) { - zfs_nicebytes(issue_rate, issue_rate_buf, - sizeof (issue_rate_buf)); - (void) printf(gettext(" at %s/s"), issue_rate_buf); - } - (void) printf(gettext("\n")); + secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / + MAX(scan_rate, 1), time_buf); + (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " + "%s total\n"), bytes_scanned_buf, scan_rate_buf, + bytes_issued_buf, issue_rate_buf, bytes_est_buf); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { - if (bytes_est_s >= bytes_scanned && - scan_rate >= 10 * 1024 * 1024) { - secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate, - time_buf); + if (scan_rate >= 10 * 1024 * 1024) { (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " @@ -7914,7 +7900,7 @@ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char 
*name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); - print_rebuild_status_impl(vrs, i, name); + print_rebuild_status_impl(vrs, name); free(name); } } @@ -8019,15 +8005,13 @@ active_resilver = (ps->pss_state == DSS_SCANNING); } + have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); scrub_start = ps->pss_start_time; - if (c > offsetof(pool_scan_stat_t, - pss_pass_error_scrub_pause) / 8) { - have_errorscrub = (ps->pss_error_scrub_func == - POOL_SCAN_ERRORSCRUB); - errorscrub_start = ps->pss_error_scrub_start; - } + have_errorscrub = (ps->pss_error_scrub_func == + POOL_SCAN_ERRORSCRUB); + errorscrub_start = ps->pss_error_scrub_start; } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c b/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c --- a/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c @@ -238,7 +238,6 @@ print_kv("end_ts", ps->pss_end_time); print_kv(",errors", ps->pss_errors); print_kv(",examined", examined); - print_kv(",skipped", ps->pss_skipped); print_kv(",issued", ps->pss_issued); print_kv(",pass_examined", pass_exam); print_kv(",pass_issued", ps->pss_pass_issued); @@ -250,6 +249,7 @@ print_kv(",remaining_t", remaining_time); print_kv(",start_ts", ps->pss_start_time); print_kv(",to_examine", ps->pss_to_examine); + print_kv(",to_process", ps->pss_to_process); printf(" %llu\n", (u_longlong_t)timestamp); return (0); } diff --git a/sys/contrib/openzfs/config/kernel-reclaim_state.m4 b/sys/contrib/openzfs/config/kernel-reclaim_state.m4 deleted file mode 100644 --- a/sys/contrib/openzfs/config/kernel-reclaim_state.m4 +++ /dev/null @@ -1,26 +0,0 @@ -AC_DEFUN([ZFS_AC_KERNEL_SRC_RECLAIMED], [ - dnl # - dnl # 6.4 API change - dnl # The reclaimed_slab of struct reclaim_state - dnl # is renamed to reclaimed - dnl # - ZFS_LINUX_TEST_SRC([reclaim_state_reclaimed], [ - #include - static const struct reclaim_state - rs __attribute__ ((unused)) = { - .reclaimed = 100, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_RECLAIMED], [ - AC_MSG_CHECKING([whether struct reclaim_state has reclaimed field]) - ZFS_LINUX_TEST_RESULT([reclaim_state_reclaimed], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_RECLAIM_STATE_RECLAIMED, 1, - [struct reclaim_state has reclaimed]) - ],[ - AC_MSG_RESULT(no) - ]) -]) - diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -153,7 +153,6 @@ ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T - ZFS_AC_KERNEL_SRC_RECLAIMED case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -286,7 +285,6 @@ ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T - ZFS_AC_KERNEL_RECLAIMED case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.zfs-zed.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.zfs-zed.init new file mode 120000 --- /dev/null +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.zfs-zed.init @@ -0,0 +1 @@ +../etc/init.d/zfs-zed \ No newline at end of file diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-import.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-import.init new file mode 120000 --- /dev/null +++ 
b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-import.init @@ -0,0 +1 @@ +../etc/init.d/zfs-import \ No newline at end of file diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-load-key.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-load-key.init new file mode 120000 --- /dev/null +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-load-key.init @@ -0,0 +1 @@ +../etc/init.d/zfs-load-key \ No newline at end of file diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-mount.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-mount.init new file mode 120000 --- /dev/null +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-mount.init @@ -0,0 +1 @@ +../etc/init.d/zfs-mount \ No newline at end of file diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-share.init b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-share.init new file mode 120000 --- /dev/null +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.zfs-share.init @@ -0,0 +1 @@ +../etc/init.d/zfs-share \ No newline at end of file diff --git a/sys/contrib/openzfs/contrib/debian/rules.in b/sys/contrib/openzfs/contrib/debian/rules.in --- a/sys/contrib/openzfs/contrib/debian/rules.in +++ b/sys/contrib/openzfs/contrib/debian/rules.in @@ -7,8 +7,8 @@ LINUX_MIN := $(shell awk '/Linux-Minimum:/{print $$2}' META) LINUX_NEXT := $(shell awk -F'[ .]' '/Linux-Maximum:/{print $$2 "." $$3+1}' META) -DKMSFILES := module include config zfs.release.in autogen.sh copy-builtin META AUTHORS \ - COPYRIGHT LICENSE README.md CODE_OF_CONDUCT.md NEWS NOTICE RELEASES.md +DKMSFILES := module include config zfs.release.in autogen.sh META AUTHORS \ + COPYRIGHT LICENSE README.md ifndef KVERS KVERS=$(shell uname -r) diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in --- a/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in @@ -36,7 +36,7 @@ { dfatal "Failed to install essential binaries"; exit 1; } # Adapted from https://github.com/zbm-dev/zfsbootmenu - if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so' && ldconfig -p 2> /dev/null | grep -qF 'libc.so.6' ; then + if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so'; then # On systems with gcc-config (Gentoo, Funtoo, etc.), use it to find libgcc_s if command -v gcc-config >/dev/null; then inst_simple "/usr/lib/gcc/$(s=$(gcc-config -c); echo "${s%-*}/${s##*-}")/libgcc_s.so.1" || diff --git a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs --- a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs +++ b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs @@ -344,7 +344,7 @@ # Need the _original_ datasets mountpoint! mountpoint=$(get_fs_value "$fs" mountpoint) - ZFS_CMD="mount -o zfsutil -t zfs" + ZFS_CMD="mount.zfs -o zfsutil" if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then # Can't use the mountpoint property. Might be one of our # clones. Check the 'org.zol:mountpoint' property set in @@ -361,7 +361,7 @@ fi # Don't use mount.zfs -o zfsutils for legacy mountpoint if [ "$mountpoint" = "legacy" ]; then - ZFS_CMD="mount -t zfs" + ZFS_CMD="mount.zfs" fi # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" @@ -944,7 +944,7 @@ echo " not specified on the kernel command line." echo "" echo "Manually mount the root filesystem on $rootmnt and then exit." 
- echo "Hint: Try: mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" + echo "Hint: Try: mount.zfs -o zfsutil ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" shell fi diff --git a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c --- a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c +++ b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c @@ -67,7 +67,6 @@ #include static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok"; -static const char OLD_PASSWORD_VAR_NAME[] = "pam_zfs_key_oldauthtok"; static libzfs_handle_t *g_zfs; @@ -161,10 +160,10 @@ } static pw_password_t * -pw_fetch(pam_handle_t *pamh, int tok) +pw_fetch(pam_handle_t *pamh) { const char *token; - if (pam_get_authtok(pamh, tok, &token, NULL) != PAM_SUCCESS) { + if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "couldn't get password from PAM stack"); return (NULL); @@ -178,13 +177,13 @@ } static const pw_password_t * -pw_fetch_lazy(pam_handle_t *pamh, int tok, const char *var_name) +pw_fetch_lazy(pam_handle_t *pamh) { - pw_password_t *pw = pw_fetch(pamh, tok); + pw_password_t *pw = pw_fetch(pamh); if (pw == NULL) { return (NULL); } - int ret = pam_set_data(pamh, var_name, pw, destroy_pw); + int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw); if (ret != PAM_SUCCESS) { pw_free(pw); pam_syslog(pamh, LOG_ERR, "pam_set_data failed"); @@ -194,23 +193,23 @@ } static const pw_password_t * -pw_get(pam_handle_t *pamh, int tok, const char *var_name) +pw_get(pam_handle_t *pamh) { const pw_password_t *authtok = NULL; - int ret = pam_get_data(pamh, var_name, + int ret = pam_get_data(pamh, PASSWORD_VAR_NAME, (const void**)(&authtok)); if (ret == PAM_SUCCESS) return (authtok); if (ret == PAM_NO_MODULE_DATA) - return (pw_fetch_lazy(pamh, tok, var_name)); + return (pw_fetch_lazy(pamh)); pam_syslog(pamh, LOG_ERR, "password not available"); return (NULL); } static int -pw_clear(pam_handle_t *pamh, const char *var_name) +pw_clear(pam_handle_t *pamh) { - int ret = pam_set_data(pamh, var_name, NULL, NULL); + int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL); if (ret != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "clearing password failed"); return (-1); @@ -387,7 +386,7 @@ int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); - if (ret && ret != EEXIST) { + if (ret) { pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); zfs_close(ds); return (-1); @@ -407,14 +406,14 @@ } static int -unmount_unload(pam_handle_t *pamh, const char *ds_name, boolean_t force) +unmount_unload(pam_handle_t *pamh, const char *ds_name) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } - int ret = zfs_unmount(ds, NULL, force ? 
MS_FORCE : 0); + int ret = zfs_unmount(ds, NULL, 0); if (ret) { pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret); zfs_close(ds); @@ -436,13 +435,9 @@ char *runstatedir; char *homedir; char *dsname; - uid_t uid_min; - uid_t uid_max; uid_t uid; const char *username; - boolean_t unmount_and_unload; - boolean_t force_unmount; - boolean_t recursive_homes; + int unmount_and_unload; } zfs_key_config_t; static int @@ -474,13 +469,9 @@ free(config->homes_prefix); return (PAM_USER_UNKNOWN); } - config->uid_min = 1000; - config->uid_max = MAXUID; config->uid = entry->pw_uid; config->username = name; - config->unmount_and_unload = B_TRUE; - config->force_unmount = B_FALSE; - config->recursive_homes = B_FALSE; + config->unmount_and_unload = 1; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { @@ -490,16 +481,8 @@ } else if (strncmp(argv[c], "runstatedir=", 12) == 0) { free(config->runstatedir); config->runstatedir = strdup(argv[c] + 12); - } else if (strncmp(argv[c], "uid_min=", 8) == 0) { - sscanf(argv[c] + 8, "%u", &config->uid_min); - } else if (strncmp(argv[c], "uid_max=", 8) == 0) { - sscanf(argv[c] + 8, "%u", &config->uid_max); } else if (strcmp(argv[c], "nounmount") == 0) { - config->unmount_and_unload = B_FALSE; - } else if (strcmp(argv[c], "forceunmount") == 0) { - config->force_unmount = B_TRUE; - } else if (strcmp(argv[c], "recursive_homes") == 0) { - config->recursive_homes = B_TRUE; + config->unmount_and_unload = 0; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { if (config->homedir == NULL) config->homedir = strdup(entry->pw_dir); @@ -534,12 +517,8 @@ (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE); if (strcmp(target->homedir, mountpoint) != 0) { - if (target->recursive_homes) { - (void) zfs_iter_filesystems_v2(zhp, 0, - find_dsname_by_prop_value, target); - } zfs_close(zhp); - return (target->dsname != NULL); + return (0); } target->dsname = strdup(zfs_get_name(zhp)); @@ -552,23 +531,17 @@ { if (config->homedir != NULL && config->homes_prefix != NULL) { - if (strcmp(config->homes_prefix, "*") == 0) { - (void) zfs_iter_root(g_zfs, - find_dsname_by_prop_value, config); - } else { - zfs_handle_t *zhp = zfs_open(g_zfs, - config->homes_prefix, ZFS_TYPE_FILESYSTEM); - if (zhp == NULL) { - pam_syslog(NULL, LOG_ERR, - "dataset %s not found", - config->homes_prefix); - return (NULL); - } - - (void) zfs_iter_filesystems_v2(zhp, 0, - find_dsname_by_prop_value, config); - zfs_close(zhp); + zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, + ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) { + pam_syslog(NULL, LOG_ERR, "dataset %s not found", + config->homes_prefix); + return (NULL); } + + (void) zfs_iter_filesystems_v2(zhp, 0, + find_dsname_by_prop_value, config); + zfs_close(zhp); char *dsname = config->dsname; config->dsname = NULL; return (dsname); @@ -682,13 +655,8 @@ if (config_err != PAM_SUCCESS) { return (config_err); } - if (config.uid < config.uid_min || config.uid > config.uid_max) { - zfs_key_config_free(&config); - return (PAM_SERVICE_ERR); - } - const pw_password_t *token = pw_fetch_lazy(pamh, - PAM_AUTHTOK, PASSWORD_VAR_NAME); + const pw_password_t *token = pw_fetch_lazy(pamh); if (token == NULL) { zfs_key_config_free(&config); return (PAM_AUTH_ERR); @@ -738,12 +706,10 @@ if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SERVICE_ERR); } - if (config.uid < config.uid_min || config.uid > config.uid_max) { + if (config.uid < 1000) { 
zfs_key_config_free(&config); - return (PAM_SERVICE_ERR); + return (PAM_SUCCESS); } - const pw_password_t *old_token = pw_get(pamh, - PAM_OLDAUTHTOK, OLD_PASSWORD_VAR_NAME); { if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); @@ -755,62 +721,49 @@ zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - if (!old_token) { - pam_syslog(pamh, LOG_ERR, - "old password from PAM stack is null"); + int key_loaded = is_key_loaded(pamh, dataset); + if (key_loaded == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - if (decrypt_mount(pamh, dataset, - old_token->value, B_TRUE) == -1) { + free(dataset); + pam_zfs_free(); + if (! key_loaded) { pam_syslog(pamh, LOG_ERR, - "old token mismatch"); - free(dataset); - pam_zfs_free(); + "key not loaded, returning try_again"); zfs_key_config_free(&config); return (PAM_PERM_DENIED); } } if ((flags & PAM_UPDATE_AUTHTOK) != 0) { - const pw_password_t *token = pw_get(pamh, PAM_AUTHTOK, - PASSWORD_VAR_NAME); + const pw_password_t *token = pw_get(pamh); if (token == NULL) { - pam_syslog(pamh, LOG_ERR, "new password unavailable"); - pam_zfs_free(); zfs_key_config_free(&config); - pw_clear(pamh, OLD_PASSWORD_VAR_NAME); + return (PAM_SERVICE_ERR); + } + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(&config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); - pw_clear(pamh, OLD_PASSWORD_VAR_NAME); - pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } - int was_loaded = is_key_loaded(pamh, dataset); - if (!was_loaded && decrypt_mount(pamh, dataset, - old_token->value, B_FALSE) == -1) { + if (change_key(pamh, dataset, token->value) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - pw_clear(pamh, OLD_PASSWORD_VAR_NAME); - pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } - int changed = change_key(pamh, dataset, token->value); - if (!was_loaded) { - unmount_unload(pamh, dataset, config.force_unmount); - } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - if (pw_clear(pamh, OLD_PASSWORD_VAR_NAME) == -1 || - pw_clear(pamh, PASSWORD_VAR_NAME) == -1 || changed == -1) { + if (pw_clear(pamh) == -1) { return (PAM_SERVICE_ERR); } } else { @@ -835,7 +788,7 @@ return (PAM_SESSION_ERR); } - if (config.uid < config.uid_min || config.uid > config.uid_max) { + if (config.uid < 1000) { zfs_key_config_free(&config); return (PAM_SUCCESS); } @@ -846,8 +799,7 @@ return (PAM_SUCCESS); } - const pw_password_t *token = pw_get(pamh, - PAM_AUTHTOK, PASSWORD_VAR_NAME); + const pw_password_t *token = pw_get(pamh); if (token == NULL) { zfs_key_config_free(&config); return (PAM_SESSION_ERR); @@ -871,7 +823,7 @@ free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - if (pw_clear(pamh, PASSWORD_VAR_NAME) == -1) { + if (pw_clear(pamh) == -1) { return (PAM_SERVICE_ERR); } return (PAM_SUCCESS); @@ -894,7 +846,7 @@ if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SESSION_ERR); } - if (config.uid < config.uid_min || config.uid > config.uid_max) { + if (config.uid < 1000) { zfs_key_config_free(&config); return (PAM_SUCCESS); } @@ -916,7 +868,7 @@ zfs_key_config_free(&config); return (PAM_SESSION_ERR); } - if (unmount_unload(pamh, dataset, config.force_unmount) == -1) { + if (unmount_unload(pamh, dataset) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/atomic.h 
b/sys/contrib/openzfs/include/os/freebsd/spl/sys/atomic.h --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/atomic.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/atomic.h @@ -167,7 +167,7 @@ return (atomic_add_64_nv(target, -1)); } -#ifdef __LP64__ +#if !defined(COMPAT_32BIT) && defined(__LP64__) static __inline void * atomic_cas_ptr(volatile void *target, void *cmp, void *newval) { @@ -181,7 +181,7 @@ return ((void *)atomic_cas_32((volatile uint32_t *)target, (uint32_t)cmp, (uint32_t)newval)); } -#endif /* __LP64__ */ +#endif /* !defined(COMPAT_32BIT) && defined(__LP64__) */ #else /* _STANDALONE */ /* diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h @@ -75,7 +75,7 @@ extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); -__attribute__((malloc, alloc_size(1))) +__attribute__((alloc_size(1))) void *zfs_kmem_alloc(size_t size, int kmflags); void zfs_kmem_free(void *buf, size_t size); uint64_t kmem_size(void); @@ -83,7 +83,6 @@ int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); void kmem_cache_destroy(kmem_cache_t *cache); -__attribute__((malloc)) void *kmem_cache_alloc(kmem_cache_t *cache, int flags); void kmem_cache_free(kmem_cache_t *cache, void *buf); boolean_t kmem_cache_reap_active(void); diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h @@ -68,6 +68,7 @@ zfs_trim, zfs_txg, zfs_vdev, + zfs_vdev_cache, zfs_vdev_file, zfs_vdev_mirror, zfs_vnops, diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h --- a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h @@ -31,10 +31,10 @@ #include extern int kmem_debugging(void); -__attribute__((format(printf, 1, 0))) -extern char *kmem_vasprintf(const char *fmt, va_list ap); -__attribute__((format(printf, 1, 2))) -extern char *kmem_asprintf(const char *fmt, ...); +extern char *kmem_vasprintf(const char *fmt, va_list ap) + __attribute__((format(printf, 1, 0))); +extern char *kmem_asprintf(const char *fmt, ...) 
+ __attribute__((format(printf, 1, 2))); extern char *kmem_strdup(const char *str); extern void kmem_strfree(char *str); @@ -186,10 +186,10 @@ #define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) #define kmem_cache_reap_active spl_kmem_cache_reap_active -__attribute__((malloc, alloc_size(1))) -extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); -__attribute__((malloc, alloc_size(1))) -extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line) + __attribute__((alloc_size(1))); +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line) + __attribute__((alloc_size(1))); extern void spl_kmem_free(const void *ptr, size_t sz); /* diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h --- a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h @@ -104,7 +104,6 @@ /* list node for the cpu hotplug callback */ struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; - unsigned long lastshouldstop; /* when to purge dynamic */ } taskq_t; typedef struct taskq_ent { diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h b/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h --- a/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h @@ -91,10 +91,8 @@ #define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) #define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) -extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line) - __attribute__((malloc, alloc_size(1))); -extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line) - __attribute__((malloc, alloc_size(1))); +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); extern void spl_vmem_free(const void *ptr, size_t sz); int spl_vmem_init(void); diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h @@ -215,39 +215,6 @@ TP_ARGS(zilog, zcw)) DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error); -/* - * Generic support for three argument tracepoints of the form: - * - * DTRACE_PROBE3(..., - * zilog_t *, ..., - * uint64_t, ..., - * uint64_t, ...); - */ -/* BEGIN CSTYLED */ -DECLARE_EVENT_CLASS(zfs_zil_block_size_class, - TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), - TP_ARGS(zilog, res, s1), - TP_STRUCT__entry( - ZILOG_TP_STRUCT_ENTRY - __field(uint64_t, res) - __field(uint64_t, s1) - ), - TP_fast_assign( - ZILOG_TP_FAST_ASSIGN - __entry->res = res; - __entry->s1 = s1; - ), - TP_printk( - ZILOG_TP_PRINTK_FMT " res %llu s1 %llu", - ZILOG_TP_PRINTK_ARGS, __entry->res, __entry->s1) -); - -#define DEFINE_ZIL_BLOCK_SIZE_EVENT(name) \ -DEFINE_EVENT(zfs_zil_block_size_class, name, \ - TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), \ - TP_ARGS(zilog, res, s1)) -DEFINE_ZIL_BLOCK_SIZE_EVENT(zfs_zil__block__size); - #endif /* _TRACE_ZIL_H */ #undef TRACE_INCLUDE_PATH @@ -261,7 +228,6 @@ DEFINE_DTRACE_PROBE2(zil__process__commit__itx); DEFINE_DTRACE_PROBE2(zil__process__normal__itx); DEFINE_DTRACE_PROBE2(zil__commit__io__error); -DEFINE_DTRACE_PROBE3(zil__block__size); #endif /* 
HAVE_DECLARE_EVENT_CLASS */ #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/include/sys/abd.h b/sys/contrib/openzfs/include/sys/abd.h --- a/sys/contrib/openzfs/include/sys/abd.h +++ b/sys/contrib/openzfs/include/sys/abd.h @@ -86,15 +86,10 @@ * Allocations and deallocations */ -__attribute__((malloc)) abd_t *abd_alloc(size_t, boolean_t); -__attribute__((malloc)) abd_t *abd_alloc_linear(size_t, boolean_t); -__attribute__((malloc)) abd_t *abd_alloc_gang(void); -__attribute__((malloc)) abd_t *abd_alloc_for_io(size_t, boolean_t); -__attribute__((malloc)) abd_t *abd_alloc_sametype(abd_t *, size_t); boolean_t abd_size_alloc_linear(size_t); void abd_gang_add(abd_t *, abd_t *, boolean_t); diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h --- a/sys/contrib/openzfs/include/sys/arc.h +++ b/sys/contrib/openzfs/include/sys/arc.h @@ -304,8 +304,9 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, - arc_write_done_func_t *done, void *priv, zio_priority_t priority, - int zio_flags, const zbookmark_phys_t *zb); + arc_write_done_func_t *physdone, arc_write_done_func_t *done, + void *priv, zio_priority_t priority, int zio_flags, + const zbookmark_phys_t *zb); arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -123,6 +123,7 @@ void *awcb_private; arc_write_done_func_t *awcb_ready; arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; arc_write_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; diff --git a/sys/contrib/openzfs/include/sys/btree.h b/sys/contrib/openzfs/include/sys/btree.h --- a/sys/contrib/openzfs/include/sys/btree.h +++ b/sys/contrib/openzfs/include/sys/btree.h @@ -105,13 +105,8 @@ boolean_t bti_before; } zfs_btree_index_t; -typedef struct btree zfs_btree_t; -typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t, - const void *, zfs_btree_index_t *); - -struct btree { +typedef struct btree { int (*bt_compar) (const void *, const void *); - bt_find_in_buf_f bt_find_in_buf; size_t bt_elem_size; size_t bt_leaf_size; uint32_t bt_leaf_cap; @@ -120,54 +115,7 @@ uint64_t bt_num_nodes; zfs_btree_hdr_t *bt_root; zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading -}; - -/* - * Implementation of Shar's algorithm designed to accelerate binary search by - * eliminating impossible to predict branches. - * - * For optimality, this should be used to generate the search function in the - * same file as the comparator and the comparator should be marked - * `__attribute__((always_inline) inline` so that the compiler will inline it. - * - * Arguments are: - * - * NAME - The function name for this instance of the search function. Use it - * in a subsequent call to zfs_btree_create(). - * T - The element type stored inside the B-Tree. - * COMP - A comparator to compare two nodes, it must return exactly: -1, 0, - * or +1 -1 for <, 0 for ==, and +1 for >. For trivial comparisons, - * TREE_CMP() from avl.h can be used in a boilerplate function. 
- */ -/* BEGIN CSTYLED */ -#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \ -_Pragma("GCC diagnostic push") \ -_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \ -static void * \ -NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \ - const void *value, zfs_btree_index_t *where) \ -{ \ - T *i = (T *)buf; \ - (void) tree; \ - _Pragma("GCC unroll 9") \ - while (nelems > 1) { \ - uint32_t half = nelems / 2; \ - nelems -= half; \ - i += (COMP(&i[half - 1], value) < 0) * half; \ - } \ - \ - int comp = COMP(i, value); \ - where->bti_offset = (i - (T *)buf) + (comp < 0); \ - where->bti_before = (comp != 0); \ - \ - if (comp == 0) { \ - return (i); \ - } \ - \ - return (NULL); \ -} \ -_Pragma("GCC diagnostic pop") -/* END CSTYLED */ +} zfs_btree_t; /* * Allocate and deallocate caches for btree nodes. @@ -181,19 +129,13 @@ * tree - the tree to be initialized * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 * -1 for <, 0 for ==, and +1 for > - * find - optional function to accelerate searches inside B-Tree nodes - * through Shar's algorithm and comparator inlining. Setting this to - * NULL will use a generic function. The function should be created - * using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar. - * compar should be marked `__attribute__((always_inline)) inline` or - * performance is unlikely to improve very much. * size - the value of sizeof(struct my_type) * lsize - custom leaf size */ void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), - bt_find_in_buf_f, size_t); + size_t); void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *), - bt_find_in_buf_f, size_t, size_t); + size_t, size_t); /* * Find a node with a matching value in the tree. Returns the matching node diff --git a/sys/contrib/openzfs/include/sys/dsl_scan.h b/sys/contrib/openzfs/include/sys/dsl_scan.h --- a/sys/contrib/openzfs/include/sys/dsl_scan.h +++ b/sys/contrib/openzfs/include/sys/dsl_scan.h @@ -61,7 +61,7 @@ uint64_t scn_end_time; uint64_t scn_to_examine; /* total bytes to be scanned */ uint64_t scn_examined; /* bytes scanned so far */ - uint64_t scn_skipped; /* bytes skipped by scanner */ + uint64_t scn_to_process; uint64_t scn_processed; uint64_t scn_errors; /* scan I/O error count */ uint64_t scn_ddt_class_max; diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h --- a/sys/contrib/openzfs/include/sys/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fs/zfs.h @@ -1088,7 +1088,7 @@ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ - uint64_t pss_skipped; /* total bytes skipped by scanner */ + uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ @@ -1152,7 +1152,6 @@ uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ - uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -723,10 +723,16 @@ * Send TRIM commands in-line during normal pool operation while deleting. 
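
For orientation: the ZFS_BTREE_FIND_IN_BUF_FUNC() macro removed above generates a uniform ("Shar's algorithm") binary search, replacing the unpredictable branch of classic bisection with arithmetic on the comparator result. Below is a standalone sketch of what one instantiation expands to, specialized for uint64_t keys; the function names are illustrative, not part of OpenZFS.

#include <stdint.h>

static inline int
u64_cmp(const uint64_t *a, const uint64_t *b)
{
	/* Exactly -1, 0, or +1, as the macro's COMP contract requires. */
	return ((*a > *b) - (*a < *b));
}

/*
 * Returns the matching element or NULL; *offset and *before mirror
 * zfs_btree_index_t's bti_offset/bti_before. Assumes nelems >= 1,
 * exactly as the generated code does.
 */
static uint64_t *
find_in_buf_u64(uint64_t *buf, uint32_t nelems, const uint64_t *value,
    uint32_t *offset, int *before)
{
	uint64_t *i = buf;

	/*
	 * Halve the window each step; the comparison result is folded
	 * into the index arithmetic, so there is no data-dependent
	 * branch for the CPU to mispredict.
	 */
	while (nelems > 1) {
		uint32_t half = nelems / 2;
		nelems -= half;
		i += (u64_cmp(&i[half - 1], value) < 0) * half;
	}

	int comp = u64_cmp(i, value);
	*offset = (uint32_t)(i - buf) + (comp < 0);
	*before = (comp != 0);
	return (comp == 0 ? i : NULL);
}

The revert drops this specialization hook, so all lookups fall back to the generic zfs_btree_find_in_buf() with an indirect bt_compar call per step.
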
* OFF: no * ON: yes + * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, +#ifdef IN_FREEBSD_BASE + SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, +#else + SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, +#endif } spa_autotrim_t; /* @@ -1168,6 +1174,10 @@ zbookmark_phys_t *zb); extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); +/* vdev cache */ +extern void vdev_cache_stat_init(void); +extern void vdev_cache_stat_fini(void); + /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); diff --git a/sys/contrib/openzfs/include/sys/vdev.h b/sys/contrib/openzfs/include/sys/vdev.h --- a/sys/contrib/openzfs/include/sys/vdev.h +++ b/sys/contrib/openzfs/include/sys/vdev.h @@ -158,15 +158,20 @@ extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); +extern void vdev_cache_init(vdev_t *vd); +extern void vdev_cache_fini(vdev_t *vd); +extern boolean_t vdev_cache_read(zio_t *zio); +extern void vdev_cache_write(zio_t *zio); +extern void vdev_cache_purge(vdev_t *vd); + extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); -extern uint32_t vdev_queue_length(vdev_t *vd); +extern int vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); -extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -57,6 +57,8 @@ * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; +typedef struct vdev_cache vdev_cache_t; +typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern uint_t zfs_vdev_queue_depth_pct; @@ -130,24 +132,44 @@ /* * Virtual device properties */ -typedef union vdev_queue_class { - list_t vqc_list; - avl_tree_t vqc_tree; +struct vdev_cache_entry { + struct abd *ve_abd; + uint64_t ve_offset; + clock_t ve_lastused; + avl_node_t ve_offset_node; + avl_node_t ve_lastused_node; + uint32_t ve_hits; + uint16_t ve_missed_update; + zio_t *ve_fill_io; +}; + +struct vdev_cache { + avl_tree_t vc_offset_tree; + avl_tree_t vc_lastused_tree; + kmutex_t vc_lock; +}; + +typedef struct vdev_queue_class { + uint32_t vqc_active; + + /* + * Sorted by offset or timestamp, depending on if the queue is + * LBA-ordered vs FIFO. + */ + avl_tree_t vqc_queued_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; + avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; + avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ - uint32_t vq_cqueued; /* Classes with queued I/Os. */ - uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; - uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ - list_t vq_active_list; /* List of active I/Os. 
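
The restored vqc_queued_tree comment above notes that each class tree is sorted "by offset or timestamp, depending on if the queue is LBA-ordered vs FIFO". A minimal sketch of the two comparator shapes such a tree would use; the type and names are invented for illustration:

#include <stdint.h>

typedef struct sk_io {
	uint64_t offset;	/* LBA of the I/O */
	uint64_t timestamp;	/* submission time */
} sk_io_t;

/* LBA order: neighboring offsets sort together, which helps aggregation. */
static int
sk_offset_cmp(const sk_io_t *a, const sk_io_t *b)
{
	return ((a->offset > b->offset) - (a->offset < b->offset));
}

/* FIFO order: oldest submission first, preserving arrival order. */
static int
sk_timestamp_cmp(const sk_io_t *a, const sk_io_t *b)
{
	return ((a->timestamp > b->timestamp) -
	    (a->timestamp < b->timestamp));
}
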
*/ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ @@ -421,6 +443,7 @@ boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ + vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ diff --git a/sys/contrib/openzfs/include/sys/vdev_rebuild.h b/sys/contrib/openzfs/include/sys/vdev_rebuild.h --- a/sys/contrib/openzfs/include/sys/vdev_rebuild.h +++ b/sys/contrib/openzfs/include/sys/vdev_rebuild.h @@ -79,7 +79,6 @@ uint64_t vr_pass_start_time; uint64_t vr_pass_bytes_scanned; uint64_t vr_pass_bytes_issued; - uint64_t vr_pass_bytes_skipped; /* On-disk state updated by vdev_rebuild_zap_update_sync() */ vdev_rebuild_phys_t vr_rebuild_phys; diff --git a/sys/contrib/openzfs/include/sys/zfs_refcount.h b/sys/contrib/openzfs/include/sys/zfs_refcount.h --- a/sys/contrib/openzfs/include/sys/zfs_refcount.h +++ b/sys/contrib/openzfs/include/sys/zfs_refcount.h @@ -27,7 +27,6 @@ #define _SYS_ZFS_REFCOUNT_H #include -#include #include #include @@ -44,22 +43,19 @@ #ifdef ZFS_DEBUG typedef struct reference { - union { - avl_node_t a; - list_node_t l; - } ref_link; + list_node_t ref_link; const void *ref_holder; uint64_t ref_number; - boolean_t ref_search; + uint8_t *ref_removed; } reference_t; typedef struct refcount { - uint64_t rc_count; kmutex_t rc_mtx; - avl_tree_t rc_tree; - list_t rc_removed; - uint_t rc_removed_count; boolean_t rc_tracked; + list_t rc_list; + list_t rc_removed; + uint64_t rc_count; + uint64_t rc_removed_count; } zfs_refcount_t; /* @@ -77,15 +73,13 @@ int64_t zfs_refcount_add(zfs_refcount_t *, const void *); int64_t zfs_refcount_remove(zfs_refcount_t *, const void *); /* - * Note that (add|remove)_many adds/removes one reference with "number" N, - * _not_ N references with "number" 1, which is what (add|remove)_few does, - * or what vanilla zfs_refcount_(add|remove) called N times would do. + * Note that (add|remove)_many add/remove one reference with "number" N, + * _not_ make N references with "number" 1, which is what vanilla + * zfs_refcount_(add|remove) would do if called N times. * * Attempting to remove a reference with number N when none exists is a * panic on debug kernels with reference_tracking enabled. 
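
A short sketch of the tracked-reference semantics the zfs_refcount.h comment above restores; it assumes a debug (ZFS_DEBUG) build and only the declarations shown in that header:

#include <sys/zfs_refcount.h>	/* the kernel-side header shown above */

static void
refcount_example(void)
{
	zfs_refcount_t rc;
	void *holder = &rc;	/* any stable tag pointer works */

	zfs_refcount_create(&rc);
	/* ONE tracked reference whose "number" is 3... */
	zfs_refcount_add_many(&rc, 3, holder);
	/* ...so it must be matched by one remove with the same number: */
	zfs_refcount_remove_many(&rc, 3, holder);
	zfs_refcount_destroy(&rc);
}

Three plain zfs_refcount_add() calls, by contrast, would need three matching zfs_refcount_remove() calls before the refcount could be destroyed.
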
*/ -void zfs_refcount_add_few(zfs_refcount_t *, uint64_t, const void *); -void zfs_refcount_remove_few(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *); void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); @@ -114,10 +108,6 @@ #define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count) #define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) #define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) -#define zfs_refcount_add_few(rc, number, holder) \ - atomic_add_64(&(rc)->rc_count, number) -#define zfs_refcount_remove_few(rc, number, holder) \ - atomic_add_64(&(rc)->rc_count, -number) #define zfs_refcount_add_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, number) #define zfs_refcount_remove_many(rc, number, holder) \ diff --git a/sys/contrib/openzfs/include/sys/zfs_znode.h b/sys/contrib/openzfs/include/sys/zfs_znode.h --- a/sys/contrib/openzfs/include/sys/zfs_znode.h +++ b/sys/contrib/openzfs/include/sys/zfs_znode.h @@ -158,7 +158,6 @@ #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); -extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); #ifdef _KERNEL #include @@ -281,6 +280,7 @@ extern void zfs_remove_op_tables(void); extern int zfs_create_op_tables(void); extern dev_t zfs_cmpldev(uint64_t); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); extern void zfs_znode_dmu_fini(znode_t *); diff --git a/sys/contrib/openzfs/include/sys/zil.h b/sys/contrib/openzfs/include/sys/zil.h --- a/sys/contrib/openzfs/include/sys/zil.h +++ b/sys/contrib/openzfs/include/sys/zil.h @@ -489,22 +489,18 @@ * Transactions which have been allocated to the "normal" * (i.e. not slog) storage pool. Note that "bytes" accumulate * the actual log record sizes - which do not include the actual - * data in case of indirect writes. bytes <= write <= alloc. + * data in case of indirect writes. */ kstat_named_t zil_itx_metaslab_normal_count; kstat_named_t zil_itx_metaslab_normal_bytes; - kstat_named_t zil_itx_metaslab_normal_write; - kstat_named_t zil_itx_metaslab_normal_alloc; /* * Transactions which have been allocated to the "slog" storage pool. * If there are no separate log devices, this is the same as the - * "normal" pool. bytes <= write <= alloc. + * "normal" pool. */ kstat_named_t zil_itx_metaslab_slog_count; kstat_named_t zil_itx_metaslab_slog_bytes; - kstat_named_t zil_itx_metaslab_slog_write; - kstat_named_t zil_itx_metaslab_slog_alloc; } zil_kstat_values_t; typedef struct zil_sums { @@ -519,12 +515,8 @@ wmsum_t zil_itx_needcopy_bytes; wmsum_t zil_itx_metaslab_normal_count; wmsum_t zil_itx_metaslab_normal_bytes; - wmsum_t zil_itx_metaslab_normal_write; - wmsum_t zil_itx_metaslab_normal_alloc; wmsum_t zil_itx_metaslab_slog_count; wmsum_t zil_itx_metaslab_slog_bytes; - wmsum_t zil_itx_metaslab_slog_write; - wmsum_t zil_itx_metaslab_slog_alloc; } zil_sums_t; #define ZIL_STAT_INCR(zil, stat, val) \ diff --git a/sys/contrib/openzfs/include/sys/zil_impl.h b/sys/contrib/openzfs/include/sys/zil_impl.h --- a/sys/contrib/openzfs/include/sys/zil_impl.h +++ b/sys/contrib/openzfs/include/sys/zil_impl.h @@ -44,7 +44,7 @@ * must be held. 
* * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must + * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must * be held when making this transition. * * After the lwb's write zio completes, it transitions into the "write @@ -93,23 +93,20 @@ blkptr_t lwb_blk; /* on disk address of this log blk */ boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ - boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ int lwb_nused; /* # used bytes in buffer */ - int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ - hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ uint64_t lwb_issued_txg; /* the txg when the write is issued */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ - list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ list_t lwb_itxs; /* list of itx's */ list_t lwb_waiters; /* list of zil_commit_waiter's */ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ + hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ } lwb_t; /* diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -190,6 +190,7 @@ #define ZIO_FLAG_SPECULATIVE (1ULL << 8) #define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) #define ZIO_FLAG_DONT_RETRY (1ULL << 10) +#define ZIO_FLAG_DONT_CACHE (1ULL << 11) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) @@ -341,9 +342,9 @@ enum zio_checksum zp_checksum; enum zio_compress zp_compress; uint8_t zp_complevel; + dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; - dmu_object_type_t zp_type; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; @@ -436,12 +437,6 @@ list_node_t zl_child_node; } zio_link_t; -enum zio_qstate { - ZIO_QS_NONE = 0, - ZIO_QS_QUEUED, - ZIO_QS_ACTIVE, -}; - struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -466,6 +461,7 @@ /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_children_ready; + zio_done_func_t *io_physdone; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ @@ -485,12 +481,6 @@ const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ - enum zio_qstate io_queue_state; /* vdev queue state */ - union { - list_node_t l; - avl_node_t a; - } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ - avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; @@ -498,6 +488,9 @@ hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). 
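
The state-machine comment in the zil_impl.h hunk above walks an lwb through its lifecycle. Paraphrased as a sketch, with the caveat that the authoritative lwb_state_t enumerators live in zil_impl.h itself:

/*
 *   opened --(zil_lwb_write_issue(), zl_issuer_lock held)--> issued
 *   issued --(write zio completes)--> write done
 *   write done --(flush zios complete)--> flush done
 */
typedef enum {
	SK_OPENED, SK_ISSUED, SK_WRITE_DONE, SK_FLUSH_DONE
} sk_lwb_state_t;

/* States only ever advance, matching the documented ordering. */
static sk_lwb_state_t
sk_lwb_advance(sk_lwb_state_t s)
{
	return ((sk_lwb_state_t)(s < SK_FLUSH_DONE ? s + 1 : s));
}
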
*/ + avl_node_t io_queue_node; + avl_node_t io_offset_node; + avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ @@ -511,6 +504,9 @@ int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t io_child_count; + uint64_t io_phys_children; + uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; @@ -558,8 +554,9 @@ extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *done, void *priv, zio_priority_t priority, - zio_flag_t flags, const zbookmark_phys_t *zb); + zio_done_func_t *physdone, zio_done_func_t *done, + void *priv, zio_priority_t priority, zio_flag_t flags, + const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, @@ -611,7 +608,6 @@ extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); -extern void zio_add_child_first(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); diff --git a/sys/contrib/openzfs/lib/libspl/include/umem.h b/sys/contrib/openzfs/lib/libspl/include/umem.h --- a/sys/contrib/openzfs/lib/libspl/include/umem.h +++ b/sys/contrib/openzfs/lib/libspl/include/umem.h @@ -83,7 +83,7 @@ const char *_umem_options_init(void); const char *_umem_logging_init(void); -__attribute__((malloc, alloc_size(1))) +__attribute__((alloc_size(1))) static inline void * umem_alloc(size_t size, int flags) { @@ -96,7 +96,7 @@ return (ptr); } -__attribute__((malloc, alloc_size(1))) +__attribute__((alloc_size(1))) static inline void * umem_alloc_aligned(size_t size, size_t align, int flags) { @@ -118,7 +118,7 @@ return (ptr); } -__attribute__((malloc, alloc_size(1))) +__attribute__((alloc_size(1))) static inline void * umem_zalloc(size_t size, int flags) { @@ -188,7 +188,6 @@ umem_free(cp, sizeof (umem_cache_t)); } -__attribute__((malloc)) static inline void * umem_cache_alloc(umem_cache_t *cp, int flags) { diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c --- a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c @@ -1789,8 +1789,7 @@ nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; - zfs_prop_t prop; - boolean_t nsprop = B_FALSE; + zfs_prop_t prop = 0; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), @@ -1837,7 +1836,6 @@ elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); - nsprop |= zfs_is_namespace_prop(prop); assert(cl_idx < nvl_len); /* @@ -1936,7 +1934,8 @@ * if one of the options handled by the generic * Linux namespace layer has been modified. 
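
Several hunks above (spl kmem.h, spl vmem.h, libspl umem.h) drop __attribute__((malloc)) from allocator declarations while alloc_size(1) is kept on some of them. The distinction, sketched on a hypothetical allocator (the attribute semantics themselves are standard GCC/Clang behavior):

#include <stddef.h>

/*
 * alloc_size(1): argument 1 is the size of the returned object, letting
 * the compiler diagnose out-of-bounds accesses to the result.
 * malloc (not used here): would additionally promise that the result
 * aliases no other live pointer, a stronger claim than some wrapped or
 * tracked allocators can safely make.
 */
__attribute__((alloc_size(1)))
void *my_alloc(size_t sz, int flags);
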
*/ - if (nsprop && zfs_is_mounted(zhp, NULL)) + if (zfs_is_namespace_prop(prop) && + zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } diff --git a/sys/contrib/openzfs/lib/libzpool/Makefile.am b/sys/contrib/openzfs/lib/libzpool/Makefile.am --- a/sys/contrib/openzfs/lib/libzpool/Makefile.am +++ b/sys/contrib/openzfs/lib/libzpool/Makefile.am @@ -135,6 +135,7 @@ module/zfs/uberblock.c \ module/zfs/unique.c \ module/zfs/vdev.c \ + module/zfs/vdev_cache.c \ module/zfs/vdev_draid.c \ module/zfs/vdev_draid_rand.c \ module/zfs/vdev_indirect.c \ diff --git a/sys/contrib/openzfs/man/man4/spl.4 b/sys/contrib/openzfs/man/man4/spl.4 --- a/sys/contrib/openzfs/man/man4/spl.4 +++ b/sys/contrib/openzfs/man/man4/spl.4 @@ -193,19 +193,4 @@ reading it could cause a lock-up if the list grow too large without limiting the output. "(truncated)" will be shown if the list is larger than the limit. -. -.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint -(Linux-only) -How long a taskq has to have had no work before we tear it down. -Previously, we would tear down a dynamic taskq worker as soon -as we noticed it had no work, but it was observed that this led -to a lot of churn in tearing down things we then immediately -spawned anew. -In practice, it seems any nonzero value will remove the vast -majority of this churn, while the nontrivially larger value -was chosen to help filter out the little remaining churn on -a mostly idle system. -Setting this value to -.Sy 0 -will revert to the previous behavior. .El diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -239,16 +239,6 @@ Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . -.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int -Default DDT ZAP data block size as a power of 2. Note that changing this after -creating a DDT on the pool will not affect existing DDTs, only newly created -ones. -. -.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int -Default DDT ZAP indirect block size as a power of 2. Note that changing this -after creating a DDT on the pool will not affect existing DDTs, only newly -created ones. -. .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int Default dnode block size as a power of 2. . @@ -2026,12 +2016,33 @@ Flush dirty data to disk at least every this many seconds (maximum TXG duration). . +.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Allow TRIM I/O operations to be aggregated. +This is normally not helpful because the extents to be trimmed +will have been already been aggregated by the metaslab. +This option is provided for debugging and performance analysis. +. .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint Max vdev I/O aggregation size for non-rotating media. . +.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64 KiB Pc Pq uint +Shift size to inflate reads to. +. +.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq uint +Inflate reads smaller than this value to meet the +.Sy zfs_vdev_cache_bshift +size +.Pq default Sy 64 KiB . +. +.It Sy zfs_vdev_cache_size Ns = Ns Sy 0 Pq uint +Total size of the per-disk cache in bytes. 
+.Pp +Currently this feature is disabled, as it has been found to not be helpful +for performance and in some cases harmful. +. .It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation diff --git a/sys/contrib/openzfs/man/man7/zpool-features.7 b/sys/contrib/openzfs/man/man7/zpool-features.7 --- a/sys/contrib/openzfs/man/man7/zpool-features.7 +++ b/sys/contrib/openzfs/man/man7/zpool-features.7 @@ -228,10 +228,8 @@ filesystem_limits hole_birth large_blocks -livelist lz4_compress spacemap_histogram -zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev .Ed diff --git a/sys/contrib/openzfs/man/man8/zdb.8 b/sys/contrib/openzfs/man/man8/zdb.8 --- a/sys/contrib/openzfs/man/man8/zdb.8 +++ b/sys/contrib/openzfs/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd June 27, 2023 +.Dd October 7, 2020 .Dt ZDB 8 .Os . @@ -41,17 +41,9 @@ .Ar poolname Ns Op Ar / Ns Ar dataset Ns | Ns Ar objset-ID .Op Ar object Ns | Ns Ar range Ns … .Nm -.Fl B -.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … -.Op Fl U Ar cache -.Op Fl K Ar key -.Ar poolname Ns Ar / Ns Ar objset-ID -.Op Ar backup-flags -.Nm .Fl C .Op Fl A .Op Fl U Ar cache -.Op Ar poolname .Nm .Fl E .Op Fl A @@ -131,22 +123,6 @@ Display statistics regarding the number, size .Pq logical, physical and allocated and deduplication of blocks. -.It Fl B , -backup -Generate a backup stream, similar to -.Nm zfs Cm send , -but for the numeric objset ID, and without opening the dataset. -This can be useful in recovery scenarios if dataset metadata has become -corrupted but the dataset itself is readable. -The optional -.Ar flags -argument is a string of one or more of the letters -.Sy e , -.Sy L , -.Sy c , -and -.Sy w , -which correspond to the same flags in -.Xr zfs-send 8 . .It Fl c , -checksum Verify the checksum of all metadata blocks while printing block statistics .Po see diff --git a/sys/contrib/openzfs/man/man8/zfs-create.8 b/sys/contrib/openzfs/man/man8/zfs-create.8 --- a/sys/contrib/openzfs/man/man8/zfs-create.8 +++ b/sys/contrib/openzfs/man/man8/zfs-create.8 @@ -234,11 +234,14 @@ Print verbose information about the created dataset. .El .El -.Ss ZFS for Swap -Swapping to a ZFS volume is prone to deadlock and not recommended. -See OpenZFS FAQ. -.Pp -Swapping to a file on a ZFS filesystem is not supported. +.Ss ZFS Volumes as Swap +ZFS volumes may be used as swap devices. +After creating the volume with the +.Nm zfs Cm create Fl V +enable the swap area using the +.Xr swapon 8 +command. +Swapping to files on ZFS filesystems is not supported. . .Sh EXAMPLES .\" These are, respectively, examples 1, 10 from zfs.8 diff --git a/sys/contrib/openzfs/man/man8/zpool-create.8 b/sys/contrib/openzfs/man/man8/zpool-create.8 --- a/sys/contrib/openzfs/man/man8/zpool-create.8 +++ b/sys/contrib/openzfs/man/man8/zpool-create.8 @@ -87,13 +87,13 @@ However this check is not robust enough to detect simultaneous attempts to use a new device in different pools, even if .Sy multihost Ns = Sy enabled . 
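
The zfs.4 entries restored above describe the vdev cache's read inflation: reads smaller than zfs_vdev_cache_max are rounded out to a 1 << zfs_vdev_cache_bshift region. A minimal arithmetic sketch of that rounding, with invented names (this is not the actual vdev_cache.c code):

#include <stdint.h>

#define SK_VCBS		16	/* zfs_vdev_cache_bshift: 64 KiB lines */
#define SK_VCMAX	16384	/* zfs_vdev_cache_max: inflate below 16 KiB */

static void
sk_inflate_read(uint64_t offset, uint64_t size,
    uint64_t *cache_offset, uint64_t *cache_size)
{
	if (size < SK_VCMAX) {
		/* Read the whole aligned 64 KiB line containing the I/O. */
		*cache_offset = offset & ~((1ULL << SK_VCBS) - 1);
		*cache_size = 1ULL << SK_VCBS;
	} else {
		*cache_offset = offset;
		*cache_size = size;
	}
}

As the restored text notes, zfs_vdev_cache_size defaults to 0, so this machinery is compiled in but disabled in practice.
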
-The administrator must ensure that simultaneous invocations of any combination +The administrator must ensure, that simultaneous invocations of any combination of .Nm zpool Cm replace , .Nm zpool Cm create , .Nm zpool Cm add , or -.Nm zpool Cm labelclear +.Nm zpool Cm labelclear , do not refer to the same device. Using the same device in two pools will result in pool corruption. .Pp diff --git a/sys/contrib/openzfs/man/man8/zpool-events.8 b/sys/contrib/openzfs/man/man8/zpool-events.8 --- a/sys/contrib/openzfs/man/man8/zpool-events.8 +++ b/sys/contrib/openzfs/man/man8/zpool-events.8 @@ -456,6 +456,7 @@ ZIO_FLAG_SPECULATIVE:0x00000100 ZIO_FLAG_CONFIG_WRITER:0x00000200 ZIO_FLAG_DONT_RETRY:0x00000400 +ZIO_FLAG_DONT_CACHE:0x00000800 ZIO_FLAG_NODATA:0x00001000 ZIO_FLAG_INDUCE_DAMAGE:0x00002000 diff --git a/sys/contrib/openzfs/man/man8/zpool-scrub.8 b/sys/contrib/openzfs/man/man8/zpool-scrub.8 --- a/sys/contrib/openzfs/man/man8/zpool-scrub.8 +++ b/sys/contrib/openzfs/man/man8/zpool-scrub.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd June 22, 2023 +.Dd July 25, 2021 .Dt ZPOOL-SCRUB 8 .Os . @@ -123,7 +123,7 @@ .No # Nm zpool Cm status ... scan: scrub in progress since Sun Jul 25 16:07:49 2021 - 403M / 405M scanned at 100M/s, 68.4M / 405M issued at 10.0M/s + 403M scanned at 100M/s, 68.4M issued at 10.0M/s, 405M total 0B repaired, 16.91% done, 00:00:04 to go ... .Ed diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -34,20 +34,6 @@ ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= endif -# Generated binary search code is particularly bad with this optimization. -# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c -# is not affected when unrolling is done. -# Disable it until the following upstream issue is resolved: -# https://github.com/llvm/llvm-project/issues/62790 -ifeq ($(CONFIG_X86),y) -ifeq ($(CONFIG_CC_IS_CLANG),y) -CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false -CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false -CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false -CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false -endif -endif - ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ @@ -382,6 +368,7 @@ uberblock.o \ unique.o \ vdev.o \ + vdev_cache.o \ vdev_draid.o \ vdev_draid_rand.o \ vdev_indirect.o \ diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd --- a/sys/contrib/openzfs/module/Makefile.bsd +++ b/sys/contrib/openzfs/module/Makefile.bsd @@ -308,6 +308,7 @@ uberblock.c \ unique.c \ vdev.c \ + vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_indirect.c \ @@ -399,20 +400,6 @@ .include -# Generated binary search code is particularly bad with this optimization. -# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c -# is not affected when unrolling is done. 
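
The zpool-events.8 table above is each zio.h flag bit rendered in hex, so the restored ZIO_FLAG_DONT_CACHE row corresponds directly to the (1ULL << 11) definition re-added earlier in this diff. A trivial check:

#include <stdio.h>

int
main(void)
{
	/* ZIO_FLAG_DONT_CACHE = (1ULL << 11) */
	printf("ZIO_FLAG_DONT_CACHE:0x%08llx\n",
	    (unsigned long long)(1ULL << 11));	/* prints 0x00000800 */
	return (0);
}
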
-# Disable it until the following upstream issue is resolved: -# https://github.com/llvm/llvm-project/issues/62790 -.if ${CC} == "clang" -.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64" -CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false -CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false -CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false -CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false -.endif -.endif - CFLAGS.sysctl_os.c= -include ../zfs_config.h CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -872,6 +872,8 @@ "Enable to bypass vdev_validate()."); /* END CSTYLED */ +/* vdev_cache.c */ + /* vdev_mirror.c */ /* vdev_queue.c */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c @@ -495,8 +495,10 @@ { zfs_acl_node_t *aclnode; - while ((aclnode = list_remove_head(&aclp->z_acl))) + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); zfs_acl_node_free(aclnode); + } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -2220,6 +2220,92 @@ return (0); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_NFSV4; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. 
+ */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -2069,93 +2069,6 @@ return (error); } -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. - */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) { - pname = ZPL_VERSION_STR; - } else { - pname = zfs_prop_to_name(prop); - } - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_NFSV4; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - - void zfs_znode_update_vfs(znode_t *zp) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c @@ -182,11 +182,8 @@ * of that infrastructure we are responsible for incrementing it. */ if (current->reclaim_state) -#ifdef HAVE_RECLAIM_STATE_RECLAIMED - current->reclaim_state->reclaimed += size >> PAGE_SHIFT; -#else current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; -#endif + vfree(ptr); } @@ -1015,18 +1012,8 @@ ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); - - *obj = NULL; - - /* - * Since we can't sleep attempt an emergency allocation to satisfy - * the request. The only alterative is to fail the allocation but - * it's preferable try. The use of KM_NOSLEEP is expected to be rare. 
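
The zfs_get_zplprop() implementation moved into zfs_vfsops.c above is a cache-aside lookup: the objset_t cache first, then the master-node ZAP, then a compiled-in default. A hypothetical caller, assuming a valid objset_t *os:

	uint64_t zplver, casesens;

	/* The first call populates the objset cache; later calls skip
	 * the ZAP lookup entirely. */
	if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplver) == 0 &&
	    zfs_get_zplprop(os, ZFS_PROP_CASE, &casesens) == 0) {
		/* casesens is ZFS_CASE_SENSITIVE when the ZAP has no entry */
	}
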
- */ - if (flags & KM_NOSLEEP) - return (spl_emergency_alloc(skc, flags, obj)); - might_sleep(); + *obj = NULL; /* * Before allocating a new slab wait for any reaping to complete and diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -36,12 +36,6 @@ module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); -static uint_t spl_taskq_thread_timeout_ms = 10000; -/* BEGIN CSTYLED */ -module_param(spl_taskq_thread_timeout_ms, uint, 0644); -/* END CSTYLED */ -MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, - "Time to require a dynamic thread be idle before it gets cleaned up"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); @@ -854,37 +848,12 @@ tqt_thread_list) == tqt) return (0); - int no_work = + return ((tq->tq_nspawn == 0) && /* No threads are being spawned */ (tq->tq_nactive == 0) && /* No threads are handling tasks */ (tq->tq_nthreads > 1) && /* More than 1 thread is running */ (!taskq_next_ent(tq)) && /* There are no pending tasks */ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ - - /* - * If we would have said stop before, let's instead wait a bit, maybe - * we'll see more work come our way soon... - */ - if (no_work) { - /* if it's 0, we want the old behavior. */ - /* if the taskq is being torn down, we also want to go away. */ - if (spl_taskq_thread_timeout_ms == 0 || - !(tq->tq_flags & TASKQ_ACTIVE)) - return (1); - unsigned long lasttime = tq->lastshouldstop; - if (lasttime > 0) { - if (time_after(jiffies, lasttime + - msecs_to_jiffies(spl_taskq_thread_timeout_ms))) - return (1); - else - return (0); - } else { - tq->lastshouldstop = jiffies; - } - } else { - tq->lastshouldstop = 0; - } - return (0); } static int @@ -1122,7 +1091,6 @@ tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; - tq->lastshouldstop = 0; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -219,11 +219,7 @@ arc_reduce_target_size(ptob(sc->nr_to_scan)); arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); if (current->reclaim_state != NULL) -#ifdef HAVE_RECLAIM_STATE_RECLAIMED - current->reclaim_state->reclaimed += sc->nr_to_scan; -#else current->reclaim_state->reclaimed_slab += sc->nr_to_scan; -#endif /* * We are experiencing memory pressure which the arc_evict_zthr was diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -493,8 +493,10 @@ { zfs_acl_node_t *aclnode; - while ((aclnode = list_remove_head(&aclp->z_acl))) + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); zfs_acl_node_free(aclnode); + } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ 
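
The spl-taskq.c hunk above removes the idle grace period: a dynamic worker used to record when it first saw no work and only agreed to exit once spl_taskq_thread_timeout_ms had elapsed. The removed logic, condensed into one sketch (field and helper names follow the removed code; jiffies, time_after(), and msecs_to_jiffies() are the usual Linux kernel helpers):

static int
sk_should_stop(taskq_t *tq, int no_work)
{
	if (!no_work) {
		tq->lastshouldstop = 0;		/* busy again: reset clock */
		return (0);
	}
	/* A timeout of 0 restores the old eager teardown; so does a
	 * taskq that is shutting down. */
	if (spl_taskq_thread_timeout_ms == 0 || !(tq->tq_flags & TASKQ_ACTIVE))
		return (1);
	if (tq->lastshouldstop == 0) {
		tq->lastshouldstop = jiffies;	/* idleness starts now */
		return (0);
	}
	return (time_after(jiffies, tq->lastshouldstop +
	    msecs_to_jiffies(spl_taskq_thread_timeout_ms)));
}
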
b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -2052,6 +2052,91 @@ return (0); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -2254,91 +2254,6 @@ return (error); } -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. 
- */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) - pname = ZPL_VERSION_STR; - else - pname = zfs_prop_to_name(prop); - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_OFF; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -54,7 +54,7 @@ static unsigned long zvol_max_discard_blocks = 16384; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS -static unsigned int zvol_open_timeout_ms = 1000; +static const unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; @@ -1612,9 +1612,4 @@ "Process volblocksize blocks per thread"); #endif -#ifndef HAVE_BLKDEV_GET_ERESTARTSYS -module_param(zvol_open_timeout_ms, uint, 0644); -MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); -#endif - /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c --- a/sys/contrib/openzfs/module/zcommon/zpool_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c @@ -160,7 +160,7 @@ "wait | continue | panic", "FAILMODE", failuremode_table, sfeatures); zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim", - SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL, + SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "AUTOTRIM", boolean_table, sfeatures); /* hidden properties */ diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -965,7 +965,7 @@ l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ @@ -6106,7 +6106,8 @@ asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_CANFAIL | + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; @@ -6675,6 +6676,18 @@ callback->awcb_children_ready(zio, buf, callback->awcb_private); } +/* + * The SPA calls this callback for each physical write that happens on behalf + * of a logical write. See the comment in dbuf_write_physdone() for details. 
+ */ +static void +arc_write_physdone(zio_t *zio) +{ + arc_write_callback_t *cb = zio->io_private; + if (cb->awcb_physdone != NULL) + cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); +} + static void arc_write_done(zio_t *zio) { @@ -6764,9 +6777,9 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, - arc_write_done_func_t *children_ready, arc_write_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, - const zbookmark_phys_t *zb) + arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, + arc_write_done_func_t *done, void *private, zio_priority_t priority, + int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; @@ -6813,6 +6826,7 @@ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; + callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; @@ -6849,7 +6863,8 @@ abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, - arc_write_done, callback, priority, zio_flags, zb); + arc_write_physdone, arc_write_done, callback, + priority, zio_flags, zb); return (zio); } @@ -7851,7 +7866,8 @@ taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); - while ((p = list_remove_head(&arc_prune_list)) != NULL) { + while ((p = list_head(&arc_prune_list)) != NULL) { + list_remove(&arc_prune_list, p); zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); @@ -8159,7 +8175,7 @@ static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size; + uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user @@ -8176,45 +8192,35 @@ if (arc_warm == B_FALSE) size += l2arc_write_boost; + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. + */ + dev_size = dev->l2ad_end - dev->l2ad_start; + /* We need to add in the worst case scenario of log block overhead. */ - size += l2arc_log_blk_overhead(size, dev); + tsize = size + l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ - size += MAX(64 * 1024 * 1024, - (size * l2arc_trim_ahead) / 100); + tsize += MAX(64 * 1024 * 1024, + (tsize * l2arc_trim_ahead) / 100); } - /* - * Make sure the write size does not exceed the size of the cache - * device. This is important in l2arc_evict(), otherwise infinite - * iteration can occur. 
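
With the signature restored above, arc_write() takes the physdone callback between children_ready and done. A hypothetical call shape; every my_* and cb_state name here is invented:

	zio_t *zio = arc_write(pio, spa, txg, bp, buf,
	    B_FALSE /* uncached */, l2arc_ok, &zp,
	    my_ready, my_children_ready, my_physdone, my_done,
	    cb_state, ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
	zio_nowait(zio);
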
- */ - if (size > dev->l2ad_end - dev->l2ad_start) { + if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", (u_longlong_t)l2arc_log_blk_overhead(size, dev), (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); - size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; - if (l2arc_trim_ahead > 1) { - cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); - l2arc_trim_ahead = 1; - } - if (arc_warm == B_FALSE) size += l2arc_write_boost; - - size += l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { - size += MAX(64 * 1024 * 1024, - (size * l2arc_trim_ahead) / 100); - } } return (size); @@ -8313,14 +8319,20 @@ static void l2arc_do_free_on_write(void) { - l2arc_data_free_t *df; + list_t *buflist; + l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); - while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { + buflist = l2arc_free_on_write; + + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); + list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } + mutex_exit(&l2arc_free_on_write_mtx); } @@ -8833,7 +8845,7 @@ top: rerun = B_FALSE; - if (dev->l2ad_hand + distance > dev->l2ad_end) { + if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands @@ -9027,7 +9039,7 @@ */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } } @@ -9287,13 +9299,7 @@ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); - /* - * If the allocated size of this buffer plus the max - * size for the pending log block exceeds the evicted - * target size, terminate writing buffers for this run. - */ - if (write_asize + asize + - sizeof (l2arc_log_blk_phys_t) > target_sz) { + if ((write_asize + asize) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -9407,14 +9413,8 @@ * arcstat_l2_{size,asize} kstats are updated * internally. */ - if (l2arc_log_blk_insert(dev, hdr)) { - /* - * l2ad_hand will be adjusted in - * l2arc_log_blk_commit(). 
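
Both forms of the check above guard the same ring-buffer invariant: a write of distance bytes must fit between the write hand and the device end, or l2arc_evict() must wrap to the start. Since hand + distance > end is equivalent to hand > end - distance when nothing underflows, the restored ">=" form differs only by also wrapping in the exact-fit case. As a sketch:

#include <stdint.h>

/* Assumes distance <= end, so the subtraction cannot underflow. */
static int
sk_must_wrap(uint64_t hand, uint64_t end, uint64_t distance)
{
	return (hand >= end - distance);
}
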
- */ - write_asize += - l2arc_log_blk_commit(dev, pio, cb); - } + if (l2arc_log_blk_insert(dev, hdr)) + l2arc_log_blk_commit(dev, pio, cb); zio_nowait(wzio); } @@ -10173,7 +10173,8 @@ err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); @@ -10493,10 +10494,11 @@ cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); @@ -10562,7 +10564,7 @@ * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ -static uint64_t +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; @@ -10673,8 +10675,6 @@ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; - - return (asize); } /* diff --git a/sys/contrib/openzfs/module/zfs/bplist.c b/sys/contrib/openzfs/module/zfs/bplist.c --- a/sys/contrib/openzfs/module/zfs/bplist.c +++ b/sys/contrib/openzfs/module/zfs/bplist.c @@ -65,8 +65,9 @@ bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_remove_head(&bpl->bpl_list))) { + while ((bpe = list_head(&bpl->bpl_list))) { bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); func(arg, &bpe->bpe_blk, tx); kmem_free(bpe, sizeof (*bpe)); @@ -81,7 +82,10 @@ bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_remove_head(&bpl->bpl_list))) + while ((bpe = list_head(&bpl->bpl_list))) { + bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); kmem_free(bpe, sizeof (*bpe)); + } mutex_exit(&bpl->bpl_lock); } diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c --- a/sys/contrib/openzfs/module/zfs/btree.c +++ b/sys/contrib/openzfs/module/zfs/btree.c @@ -193,20 +193,14 @@ void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), - bt_find_in_buf_f bt_find_in_buf, size_t size) + size_t size) { - zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, - BTREE_LEAF_SIZE); + zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE); } -static void * -zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, - const void *value, zfs_btree_index_t *where); - void zfs_btree_create_custom(zfs_btree_t *tree, int (*compar) (const void *, const void *), - bt_find_in_buf_f bt_find_in_buf, size_t size, size_t lsize) { size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); @@ -214,8 +208,6 @@ ASSERT3U(size, <=, esize / 2); memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; - tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? 
- zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; tree->bt_leaf_size = lsize; tree->bt_leaf_cap = P2ALIGN(esize / size, 2); @@ -311,7 +303,7 @@ * element in the last leaf, it's in the last leaf or * it's not in the tree. */ - void *d = tree->bt_find_in_buf(tree, + void *d = zfs_btree_find_in_buf(tree, last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size, last_leaf->btl_hdr.bth_count, value, &idx); @@ -335,7 +327,7 @@ for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); - void *d = tree->bt_find_in_buf(tree, node->btc_elems, + void *d = zfs_btree_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { @@ -355,7 +347,7 @@ */ zfs_btree_leaf_t *leaf = (depth == 0 ? (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); - void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + + void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems + leaf->btl_hdr.bth_first * size, leaf->btl_hdr.bth_count, value, &idx); @@ -679,7 +671,7 @@ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; zfs_btree_index_t idx; ASSERT(zfs_btree_is_core(par_hdr)); - VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); uint32_t offset = idx.bti_offset; @@ -905,7 +897,7 @@ } zfs_btree_index_t idx; zfs_btree_core_t *parent = hdr->bth_parent; - VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, parent->btc_hdr.bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c --- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c +++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c @@ -49,12 +49,8 @@ { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, - { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, - { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, - { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, - { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, - { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 } + { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 } } }; diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -4369,6 +4369,22 @@ rw_exit(&parent_db->db_rwlock); } +static void +dbuf_lightweight_physdone(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dsl_pool_t *dp = spa_get_dsl(zio->io_spa); + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dbuf_lightweight_done(). 
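
Several hunks above (zfs_acl.c on both platforms, arc.c's prune list, l2arc_do_free_on_write(), bplist.c) restore the same drain idiom that list_remove_head() had abbreviated. The pattern, using the Solaris-style list_t API those files already include (node_free() stands in for the per-type destructor):

	while ((node = list_head(&lst)) != NULL) {
		list_remove(&lst, node);
		node_free(node);
	}
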
+ */ + int delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); +} + static void dbuf_lightweight_done(zio_t *zio) { @@ -4387,8 +4403,16 @@ dsl_dataset_block_born(ds, zio->io_bp, tx); } - dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, - zio->io_txg); + /* + * See comment in dbuf_write_done(). + */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } abd_free(dr->dt.dll.dr_abd); kmem_free(dr, sizeof (*dr)); @@ -4422,7 +4446,8 @@ dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, - dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE, + dbuf_lightweight_physdone, dbuf_lightweight_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); zio_nowait(dr->dr_zio); @@ -4764,6 +4789,37 @@ DB_DNODE_EXIT(db); } +/* + * The SPA will call this callback several times for each zio - once + * for every physical child i/o (zio->io_phys_children times). This + * allows the DMU to monitor the progress of each logical i/o. For example, + * there may be 2 copies of an indirect block, or many fragments of a RAID-Z + * block. There may be a long delay before all copies/fragments are completed, + * so this callback allows us to retire dirty space gradually, as the physical + * i/os complete. + */ +static void +dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) +{ + (void) buf; + dmu_buf_impl_t *db = arg; + objset_t *os = db->db_objset; + dsl_pool_t *dp = dmu_objset_pool(os); + dbuf_dirty_record_t *dr; + int delta = 0; + + dr = db->db_data_pending; + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dbuf_write_done(). + */ + delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); +} + static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { @@ -4838,8 +4894,27 @@ db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); - dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, - zio->io_txg); + /* + * If we didn't do a physical write in this ZIO and we + * still ended up here, it means that the space of the + * dbuf that we just released (and undirtied) above hasn't + * been marked as undirtied in the pool's accounting. + * + * Thus, we undirty that space in the pool's view of the + * world here. For physical writes this type of update + * happens in dbuf_write_physdone(). + * + * If we did a physical write, cleanup any rounding errors + * that came up due to writing multiple copies of a block + * on disk [see dbuf_write_physdone()]. 
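The restored physdone/done split retires dr_accounted / io_phys_children bytes once per physical child I/O and leaves the integer-division remainder to the done callback. A minimal standalone sketch of that arithmetic (hypothetical byte counts, plain C, not the ZFS API) shows the two callbacks together always retire exactly dr_accounted bytes:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint64_t dr_accounted = 131075;		/* hypothetical dirty bytes */
	int io_phys_children = 4;		/* e.g. copies/fragments on disk */
	uint64_t retired = 0;

	/* dbuf_write_physdone() runs once per physical child */
	for (int i = 0; i < io_phys_children; i++)
		retired += dr_accounted / io_phys_children;

	/* dbuf_write_done() cleans up the rounding error */
	retired += dr_accounted % io_phys_children;

	assert(retired == dr_accounted);
	return (0);
}

The io_phys_children == 0 branch in the hunks above covers zios that never issue a physical write; there the full amount is undirtied at done time.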
+ */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } kmem_free(dr, sizeof (dbuf_dirty_record_t)); } @@ -5087,7 +5162,7 @@ dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, + dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); @@ -5101,7 +5176,7 @@ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, - dbuf_write_nofill_ready, NULL, + dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); @@ -5120,8 +5195,9 @@ dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, - children_ready_cb, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + children_ready_cb, dbuf_write_physdone, + dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED, &zb); } } diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -1209,19 +1209,10 @@ ASSERT3S(dde->dde_class, <, DDT_CLASSES); ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; - - /* - * This entry already existed (dde_type is real), so it must - * have refcnt >0 at the start of this txg. We are called from - * brt_pending_apply(), before frees are issued, so the refcnt - * can't be lowered yet. Therefore, it must be >0. We assert - * this because if the order of BRT and DDT interactions were - * ever to change and the refcnt was ever zero here, then - * likely further action is required to fill out the DDT entry, - * and this is a place that is likely to be missed in testing. - */ - ASSERT3U(ddp->ddp_refcnt, >, 0); - + if (ddp->ddp_refcnt == 0) { + /* This should never happen? */ + ddt_phys_fill(ddp, bp); + } ddt_phys_addref(ddp); result = B_TRUE; } else { diff --git a/sys/contrib/openzfs/module/zfs/ddt_zap.c b/sys/contrib/openzfs/module/zfs/ddt_zap.c --- a/sys/contrib/openzfs/module/zfs/ddt_zap.c +++ b/sys/contrib/openzfs/module/zfs/ddt_zap.c @@ -31,8 +31,8 @@ #include <sys/zap.h> #include <sys/dmu_tx.h> -static unsigned int ddt_zap_default_bs = 15; -static unsigned int ddt_zap_default_ibs = 15; +static const int ddt_zap_leaf_blockshift = 12; +static const int ddt_zap_indirect_blockshift = 12; static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) @@ -43,7 +43,7 @@ flags |= ZAP_FLAG_PRE_HASHED_KEY; *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, - ddt_zap_default_bs, ddt_zap_default_ibs, + ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, DMU_OT_NONE, 0, tx); return (*objectp == 0 ?
SET_ERROR(ENOTSUP) : 0); @@ -166,10 +166,3 @@ ddt_zap_walk, ddt_zap_count, }; - -/* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW, - "DDT ZAP leaf blockshift"); -ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW, - "DDT ZAP indirect blockshift"); -/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -1698,7 +1698,7 @@ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, + dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); @@ -1864,7 +1864,7 @@ zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), - &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -1698,7 +1698,7 @@ zio = arc_write(pio, os->os_spa, tx->tx_txg, blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os), - &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done, + &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* @@ -1755,8 +1755,9 @@ taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_remove_head(list)) != NULL) { + while ((dr = list_head(list)) != NULL) { ASSERT0(dr->dr_dbuf->db_level); + list_remove(list, dr); zio_nowait(dr->dr_zio); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -1371,8 +1371,8 @@ dnode_t *dn; abd_t *abd = rrd->abd; zio_cksum_t bp_cksum = bp->blk_cksum; - zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY | - ZIO_FLAG_CANFAIL; + zio_flag_t flags = ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL; if (rwa->raw) flags |= ZIO_FLAG_RAW; diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c @@ -1955,7 +1955,7 @@ { dsl_dataset_t *to_ds = dspp->to_ds; dsl_pool_t *dp = dspp->dp; - +#ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) @@ -1964,6 +1964,7 @@ if (version >= ZPL_VERSION_SA) *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } +#endif /* raw sends imply large_block_ok */ if ((dspp->rawok || dspp->large_block_ok) && @@ -2792,7 +2793,6 @@ } if (err == 0) { - owned = B_TRUE; err = zap_lookup(dspp.dp->dp_meta_objset, dspp.to_ds->ds_object, DS_FIELD_RESUME_TOGUID, 8, 1, @@ -2806,24 +2806,21 @@ sizeof (dspp.saved_toname), dspp.saved_toname); } - /* Only disown if there was an error in the lookups */ - if (owned && (err != 0)) + if (err != 0) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); 
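Several files above and below in this patch (bplist.c, dmu_tx.c, fm.c, vdev_indirect.c, and others) replace the newer list_remove_head() helper with the older two-step list_head() / list_remove() drain idiom. The two spellings empty a list identically; a toy sketch with a hand-rolled singly linked list (local stand-ins for the illumos-style list API, not the real one) makes the shape of the loop clear:

#include <stdio.h>
#include <stdlib.h>

typedef struct node {
	struct node *next;
	int v;
} node_t;

/* toy stand-ins for list_head()/list_remove() */
static node_t *
list_head(node_t **list)
{
	return (*list);
}

static void
list_remove(node_t **list, node_t *n)
{
	node_t **pp = list;

	/* unlink n wherever it sits; here it is always the head */
	while (*pp != n)
		pp = &(*pp)->next;
	*pp = n->next;
}

int
main(void)
{
	node_t *list = NULL;
	node_t *n;

	for (int i = 0; i < 3; i++) {
		n = malloc(sizeof (*n));
		n->v = i;
		n->next = list;
		list = n;
	}

	/* the restored idiom: peek at the head, unlink it, then use it */
	while ((n = list_head(&list)) != NULL) {
		list_remove(&list, n);
		printf("%d\n", n->v);
		free(n);
	}
	return (0);
}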
kmem_strfree(name); } else { err = dsl_dataset_own(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); - if (err == 0) - owned = B_TRUE; } + owned = B_TRUE; } else { err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); } if (err != 0) { - /* Note: dsl dataset is not owned at this point */ dsl_pool_rele(dspp.dp, FTAG); return (err); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -1396,7 +1396,8 @@ { dmu_tx_callback_t *dcb; - while ((dcb = list_remove_tail(cb_list)) != NULL) { + while ((dcb = list_tail(cb_list)) != NULL) { + list_remove(cb_list, dcb); dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -520,7 +520,8 @@ issued = pf_end - pf_start + ipf_end - ipf_start; if (issued > 1) { /* More references on top of taken in dmu_zfetch_prepare(). */ - zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL); + for (int i = 0; i < issued - 1; i++) + zfs_refcount_add(&zs->zs_refs, NULL); } else if (issued == 0) { /* Some other thread has done our work, so drop the ref. */ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -3782,7 +3782,8 @@ if (l == NULL || !list_link_active(&l->list_head)) return; - while ((snap = list_remove_tail(l)) != NULL) { + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); dsl_dataset_rele(snap->ds, tag); kmem_free(snap, sizeof (*snap)); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c --- a/sys/contrib/openzfs/module/zfs/dsl_dir.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c @@ -1490,7 +1490,7 @@ if (tr_cookie == NULL) return; - while ((tr = list_remove_head(tr_list)) != NULL) { + while ((tr = list_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, @@ -1500,6 +1500,7 @@ } else { arc_tempreserve_clear(tr->tr_size); } + list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -234,7 +234,7 @@ static int zfs_free_bpobj_enabled = 1; /* Error blocks to be scrubbed in one txg. */ -static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; +uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { @@ -573,8 +573,7 @@ * counter to how far we've scanned. We know we're consistent * up to here. */ - scn->scn_issued_before_pass = scn->scn_phys.scn_examined - - scn->scn_phys.scn_skipped; + scn->scn_issued_before_pass = scn->scn_phys.scn_examined; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { @@ -3438,8 +3437,10 @@ * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. 
*/ - while ((sio = list_remove_head(&sio_list)) != NULL) + while ((sio = list_head(&sio_list)) != NULL) { + list_remove(&sio_list, sio); scan_io_queue_insert_impl(queue, sio); + } queue->q_zio = NULL; mutex_exit(q_lock); @@ -4363,7 +4364,7 @@ * Disabled by default, set zfs_scan_report_txgs to report * average performance over the last zfs_scan_report_txgs TXGs. */ - if (zfs_scan_report_txgs != 0 && + if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 && tx->tx_txg % zfs_scan_report_txgs == 0) { scn->scn_issued_before_pass += spa->spa_scan_pass_issued; spa_scan_stat_init(spa); @@ -4565,15 +4566,6 @@ all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); } -static void -count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all) -{ - if (BP_IS_EMBEDDED(bp)) - return; - atomic_add_64(&scn->scn_phys.scn_skipped, - all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); -} - static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { @@ -4719,7 +4711,7 @@ count_block(dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { - count_block_skipped(scn, bp, B_TRUE); + count_block_issued(spa, bp, B_TRUE); return (0); } @@ -4760,7 +4752,7 @@ if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { - count_block_skipped(scn, bp, B_TRUE); + count_block_issued(spa, bp, B_TRUE); } /* do not relocate this block */ @@ -4885,7 +4877,6 @@ * with single operation. Plus it makes scrubs more sequential and reduces * chances that minor extent change move it within the B-tree. */ -__attribute__((always_inline)) inline static int ext_size_compare(const void *x, const void *y) { @@ -4894,17 +4885,13 @@ return (TREE_CMP(*a, *b)); } -ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, - ext_size_compare) - static void ext_size_create(range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; - zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf, - sizeof (uint64_t)); + zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t)); } static void @@ -5129,9 +5116,9 @@ ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); - /* count the block as though we skipped it */ + /* count the block as though we issued it */ sio2bp(sio, &tmpbp); - count_block_skipped(scn, &tmpbp, B_FALSE); + count_block_issued(spa, &tmpbp, B_FALSE); sio_free(sio); } diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c --- a/sys/contrib/openzfs/module/zfs/fm.c +++ b/sys/contrib/openzfs/module/zfs/fm.c @@ -148,7 +148,8 @@ list_remove(&zevent_list, ev); /* Remove references to this event in all private file data */ - while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) { + while ((ze = list_head(&ev->ev_ze_list)) != NULL) { + list_remove(&ev->ev_ze_list, ze); ze->ze_zevent = NULL; ze->ze_dropped++; } diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -1342,7 +1342,6 @@ * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. 
*/ -__attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { @@ -1353,15 +1352,16 @@ uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); - return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); + return (TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ -__attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { @@ -1372,10 +1372,11 @@ uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); - return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); + return (TREE_CMP(r1->rs_start, r2->rs_start)); } - typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; @@ -1411,13 +1412,6 @@ range_tree_walk(rt, metaslab_size_sorted_add, &arg); } - -ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, - range_seg32_t, metaslab_rangesize32_compare) - -ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, - range_seg64_t, metaslab_rangesize64_compare) - /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. @@ -1430,22 +1424,19 @@ size_t size; int (*compare) (const void *, const void *); - bt_find_in_buf_f bt_find; switch (rt->rt_type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; - bt_find = metaslab_rt_find_rangesize32_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; - bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } - zfs_btree_create(size_tree, compare, bt_find, size); + zfs_btree_create(size_tree, compare, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } @@ -5650,7 +5641,8 @@ * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. 
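The comparator hunks above trade the branchless form, cmp + !cmp * TREE_CMP(start1, start2), for an early return when the sizes differ. Assuming TREE_CMP returns -1, 0, or 1 (the real macro lives in the ZFS headers; it is redefined here only to keep the sketch self-contained), the two forms are exhaustively equivalent:

#include <assert.h>

#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b)))

/* both spellings implement size-major, start-minor ordering */
static int
cmp_branchless(unsigned s1, unsigned o1, unsigned s2, unsigned o2)
{
	int cmp = TREE_CMP(s1, s2);
	return (cmp + !cmp * TREE_CMP(o1, o2));
}

static int
cmp_branchy(unsigned s1, unsigned o1, unsigned s2, unsigned o2)
{
	int cmp = TREE_CMP(s1, s2);
	if (cmp)
		return (cmp);
	return (TREE_CMP(o1, o2));
}

int
main(void)
{
	for (unsigned s1 = 0; s1 < 3; s1++)
	for (unsigned o1 = 0; o1 < 3; o1++)
	for (unsigned s2 = 0; s2 < 3; s2++)
	for (unsigned o2 = 0; o2 < 3; o2++)
		assert(cmp_branchless(s1, o1, s2, o2) ==
		    cmp_branchy(s1, o1, s2, o2));
	return (0);
}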
*/ - zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); + for (int d = 0; d < slots; d++) + zfs_refcount_add(&mca->mca_alloc_slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } @@ -5664,7 +5656,8 @@ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); + for (int d = 0; d < slots; d++) + zfs_refcount_remove(&mca->mca_alloc_slots, zio); } static int diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -151,7 +151,6 @@ rt->rt_histogram[idx]--; } -__attribute__((always_inline)) inline static int range_tree_seg32_compare(const void *x1, const void *x2) { @@ -164,7 +163,6 @@ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } -__attribute__((always_inline)) inline static int range_tree_seg64_compare(const void *x1, const void *x2) { @@ -177,7 +175,6 @@ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } -__attribute__((always_inline)) inline static int range_tree_seg_gap_compare(const void *x1, const void *x2) { @@ -190,15 +187,6 @@ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } -ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t, - range_tree_seg32_compare) - -ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t, - range_tree_seg64_compare) - -ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t, - range_tree_seg_gap_compare) - range_tree_t * range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, uint64_t gap) @@ -209,27 +197,23 @@ ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); size_t size; int (*compare) (const void *, const void *); - bt_find_in_buf_f bt_find; switch (type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = range_tree_seg32_compare; - bt_find = range_tree_seg32_find_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = range_tree_seg64_compare; - bt_find = range_tree_seg64_find_in_buf; break; case RANGE_SEG_GAP: size = sizeof (range_seg_gap_t); compare = range_tree_seg_gap_compare; - bt_find = range_tree_seg_gap_find_in_buf; break; default: panic("Invalid range seg type %d", type); } - zfs_btree_create(&rt->rt_root, compare, bt_find, size); + zfs_btree_create(&rt->rt_root, compare, size); rt->rt_ops = ops; rt->rt_gap = gap; diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c --- a/sys/contrib/openzfs/module/zfs/refcount.c +++ b/sys/contrib/openzfs/module/zfs/refcount.c @@ -36,40 +36,33 @@ static uint_t reference_history = 3; /* tunable */ static kmem_cache_t *reference_cache; +static kmem_cache_t *reference_history_cache; void zfs_refcount_init(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + reference_history_cache = kmem_cache_create("reference_history_cache", + sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_refcount_fini(void) { kmem_cache_destroy(reference_cache); -} - -static int -zfs_refcount_compare(const void *x1, const void *x2) -{ - const reference_t *r1 = (const reference_t *)x1; - const reference_t *r2 = (const reference_t *)x2; - - int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder); - int cmp2 = TREE_CMP(r1->ref_number, 
r2->ref_number); - int cmp = cmp1 ? cmp1 : cmp2; - return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2)); + kmem_cache_destroy(reference_history_cache); } void zfs_refcount_create(zfs_refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); - avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t), - offsetof(reference_t, ref_link.a)); + list_create(&rc->rc_list, sizeof (reference_t), + offsetof(reference_t, ref_link)); list_create(&rc->rc_removed, sizeof (reference_t), - offsetof(reference_t, ref_link.l)); + offsetof(reference_t, ref_link)); rc->rc_count = 0; rc->rc_removed_count = 0; rc->rc_tracked = reference_tracking_enable; @@ -93,15 +86,19 @@ zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; - void *cookie = NULL; ASSERT3U(rc->rc_count, ==, number); - while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL) + while ((ref = list_head(&rc->rc_list))) { + list_remove(&rc->rc_list, ref); kmem_cache_free(reference_cache, ref); - avl_destroy(&rc->rc_tree); + } + list_destroy(&rc->rc_list); - while ((ref = list_remove_head(&rc->rc_removed))) + while ((ref = list_head(&rc->rc_removed))) { + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, ref->ref_removed); kmem_cache_free(reference_cache, ref); + } list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } @@ -127,10 +124,10 @@ int64_t zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref; + reference_t *ref = NULL; int64_t count; - if (likely(!rc->rc_tracked)) { + if (!rc->rc_tracked) { count = atomic_add_64_nv(&(rc)->rc_count, number); ASSERT3U(count, >=, number); return (count); @@ -139,9 +136,8 @@ ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; - ref->ref_search = B_FALSE; mutex_enter(&rc->rc_mtx); - avl_add(&rc->rc_tree, ref); + list_insert_head(&rc->rc_list, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); @@ -155,55 +151,51 @@ return (zfs_refcount_add_many(rc, 1, holder)); } -void -zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) -{ - if (likely(!rc->rc_tracked)) - (void) zfs_refcount_add_many(rc, number, holder); - else for (; number > 0; number--) - (void) zfs_refcount_add(rc, holder); -} - int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref, s; + reference_t *ref; int64_t count; - if (likely(!rc->rc_tracked)) { + if (!rc->rc_tracked) { count = atomic_add_64_nv(&(rc)->rc_count, -number); ASSERT3S(count, >=, 0); return (count); } - s.ref_holder = holder; - s.ref_number = number; - s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, number); - ref = avl_find(&rc->rc_tree, &s, NULL); - if (unlikely(ref == NULL)) { - panic("No such hold %p on refcount %llx", holder, - (u_longlong_t)(uintptr_t)rc); - return (-1); - } - avl_remove(&rc->rc_tree, ref); - if (reference_history > 0) { - list_insert_head(&rc->rc_removed, ref); - if (rc->rc_removed_count >= reference_history) { - ref = list_remove_tail(&rc->rc_removed); - kmem_cache_free(reference_cache, ref); - } else { - rc->rc_removed_count++; + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder && ref->ref_number == number) { + list_remove(&rc->rc_list, ref); + if (reference_history > 0) { + ref->ref_removed = + kmem_cache_alloc(reference_history_cache, + KM_SLEEP); + 
list_insert_head(&rc->rc_removed, ref); + rc->rc_removed_count++; + if (rc->rc_removed_count > reference_history) { + ref = list_tail(&rc->rc_removed); + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, + ref->ref_removed); + kmem_cache_free(reference_cache, ref); + rc->rc_removed_count--; + } + } else { + kmem_cache_free(reference_cache, ref); + } + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); } - } else { - kmem_cache_free(reference_cache, ref); } - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); } int64_t @@ -212,50 +204,34 @@ return (zfs_refcount_remove_many(rc, 1, holder)); } -void -zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) -{ - if (likely(!rc->rc_tracked)) - (void) zfs_refcount_remove_many(rc, number, holder); - else for (; number > 0; number--) - (void) zfs_refcount_remove(rc, holder); -} - void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { - avl_tree_t tree; - list_t removed; - reference_t *ref; - void *cookie = NULL; - uint64_t count; - uint_t removed_count; + int64_t count, removed_count; + list_t list, removed; - avl_create(&tree, zfs_refcount_compare, sizeof (reference_t), - offsetof(reference_t, ref_link.a)); + list_create(&list, sizeof (reference_t), + offsetof(reference_t, ref_link)); list_create(&removed, sizeof (reference_t), - offsetof(reference_t, ref_link.l)); + offsetof(reference_t, ref_link)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; - avl_swap(&tree, &src->rc_tree); + list_move_tail(&list, &src->rc_list); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; - if (avl_is_empty(&dst->rc_tree)) - avl_swap(&dst->rc_tree, &tree); - else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL) - avl_add(&dst->rc_tree, ref); + list_move_tail(&dst->rc_list, &list); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); - avl_destroy(&tree); + list_destroy(&list); list_destroy(&removed); } @@ -263,19 +239,23 @@ zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, const void *current_holder, const void *new_holder) { - reference_t *ref, s; + reference_t *ref; + boolean_t found = B_FALSE; - if (likely(!rc->rc_tracked)) + if (!rc->rc_tracked) return; - s.ref_holder = current_holder; - s.ref_number = number; - s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - ref = avl_find(&rc->rc_tree, &s, NULL); - ASSERT(ref); - ref->ref_holder = new_holder; - avl_update(&rc->rc_tree, ref); + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder && + ref->ref_number == number) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); mutex_exit(&rc->rc_mtx); } @@ -295,23 +275,21 @@ boolean_t zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref, s; - avl_index_t idx; - boolean_t res; + reference_t *ref; - if (likely(!rc->rc_tracked)) + if (!rc->rc_tracked) return (zfs_refcount_count(rc) > 0); - s.ref_holder = holder; - s.ref_number = 0; - s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - ref = avl_find(&rc->rc_tree, &s, &idx); - if (likely(ref == NULL)) - ref = 
avl_nearest(&rc->rc_tree, idx, AVL_AFTER); - res = ref && ref->ref_holder == holder; + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + } mutex_exit(&rc->rc_mtx); - return (res); + return (B_FALSE); } /* @@ -322,23 +300,21 @@ boolean_t zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref, s; - avl_index_t idx; - boolean_t res; + reference_t *ref; - if (likely(!rc->rc_tracked)) + if (!rc->rc_tracked) return (B_TRUE); mutex_enter(&rc->rc_mtx); - s.ref_holder = holder; - s.ref_number = 0; - s.ref_search = B_TRUE; - ref = avl_find(&rc->rc_tree, &s, &idx); - if (likely(ref == NULL)) - ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); - res = ref == NULL || ref->ref_holder != holder; + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_FALSE); + } + } mutex_exit(&rc->rc_mtx); - return (res); + return (B_TRUE); } EXPORT_SYMBOL(zfs_refcount_create); diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -33,7 +33,6 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley - * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. */ /* @@ -1609,16 +1608,16 @@ { void *cookie = NULL; spa_log_sm_t *sls; - log_summary_entry_t *e; - while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } - while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_head(&spa->spa_log_summary)) { VERIFY0(e->lse_mscount); + list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } @@ -6875,11 +6874,9 @@ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - if (dsl_scan_resilvering(spa_get_dsl(spa)) || - dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { + if (dsl_scan_resilvering(spa_get_dsl(spa))) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); - } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -730,7 +730,7 @@ mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); @@ -814,7 +814,8 @@ if (spa->spa_root) spa_strfree(spa->spa_root); - while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { + while ((dp = list_head(&spa->spa_config_list)) != NULL) { + list_remove(&spa->spa_config_list, dp); if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); @@ -2438,6 +2439,7 @@ zio_init(); dmu_init(); zil_init(); + vdev_cache_stat_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); @@ -2461,6 +2463,7 @@ spa_evict_all(); 
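The refcount.c rewrite above drops the AVL tree keyed on (holder, number) and returns to a linked list searched linearly on remove. When reference tracking is disabled (rc_tracked off, the default), both versions collapse to a bare atomic counter; a minimal sketch of that untracked fast path (standalone C11 atomics, not the ZFS API):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

typedef struct {
	atomic_uint_fast64_t rc_count;
	int rc_tracked;		/* tracking disabled in this sketch */
} refcount_t;

/* mirrors the untracked branch: a plain atomic add returning the new count */
static uint64_t
refcount_add_many(refcount_t *rc, uint64_t n)
{
	return (atomic_fetch_add(&rc->rc_count, n) + n);
}

static uint64_t
refcount_remove_many(refcount_t *rc, uint64_t n)
{
	uint64_t prev = atomic_fetch_sub(&rc->rc_count, n);

	assert(prev >= n);	/* count must never go negative */
	return (prev - n);
}

int
main(void)
{
	refcount_t rc = { .rc_count = 0, .rc_tracked = 0 };

	assert(refcount_add_many(&rc, 3) == 3);
	assert(refcount_remove_many(&rc, 3) == 0);
	return (0);
}

The tracked slow path pays a linear list walk per remove, which is why it stays compiled out unless reference_tracking_enable is set for debugging.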
vdev_file_fini(); + vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); @@ -2611,7 +2614,7 @@ ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; - ps->pss_skipped = scn->scn_phys.scn_skipped; + ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c --- a/sys/contrib/openzfs/module/zfs/txg.c +++ b/sys/contrib/openzfs/module/zfs/txg.c @@ -895,10 +895,15 @@ boolean_t txg_all_lists_empty(txg_list_t *tl) { - boolean_t res = B_TRUE; - for (int i = 0; i < TXG_SIZE; i++) - res &= (tl->tl_head[i] == NULL); - return (res); + mutex_enter(&tl->tl_lock); + for (int i = 0; i < TXG_SIZE; i++) { + if (!txg_list_empty_impl(tl, i)) { + mutex_exit(&tl->tl_lock); + return (B_FALSE); + } + } + mutex_exit(&tl->tl_lock); + return (B_TRUE); } /* diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. - * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. + * Copyright [2021] Hewlett Packard Enterprise Development LP */ #include @@ -715,6 +715,7 @@ offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); + vdev_cache_init(vd); return (vd); } @@ -1095,6 +1096,7 @@ * Clean up vdev structure. */ vdev_queue_fini(vd); + vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); @@ -1718,7 +1720,8 @@ vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -2609,6 +2612,8 @@ vd->vdev_ops->vdev_op_close(vd); + vdev_cache_purge(vd); + /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that @@ -2694,17 +2699,6 @@ (void) vdev_validate(vd); } - /* - * Recheck if resilver is still needed and cancel any - * scheduled resilver if resilver is unneeded. - */ - if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && - spa->spa_async_tasks & SPA_ASYNC_RESILVER) { - mutex_enter(&spa->spa_async_lock); - spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; - mutex_exit(&spa->spa_async_lock); - } - /* * Reassess parent vdev's health. 
*/ @@ -4608,9 +4602,11 @@ memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { - vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; - vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); + for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { + vsx->vsx_active_queue[t] = + vd->vdev_queue.vq_class[t].vqc_active; + vsx->vsx_pend_queue[t] = avl_numnodes( + &vd->vdev_queue.vq_class[t].vqc_queued_tree); } } } @@ -5468,20 +5464,20 @@ vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); - if (vq->vq_active > 0) { + if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %u active IOs", - vd->vdev_path, vq->vq_active); + zfs_dbgmsg("slow vdev: %s has %lu active IOs", + vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ - fio = list_head(&vq->vq_active_list); + fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); diff --git a/sys/contrib/openzfs/module/zfs/vdev_cache.c b/sys/contrib/openzfs/module/zfs/vdev_cache.c new file mode 100644 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/vdev_cache.c @@ -0,0 +1,436 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/kstat.h> +#include <sys/abd.h> + +/* + * Virtual device read-ahead caching. + * + * This file implements a simple LRU read-ahead cache. When the DMU reads + * a given block, it will often want other, nearby blocks soon thereafter. + * We take advantage of this by reading a larger disk region and caching + * the result. In the best case, this can turn 128 back-to-back 512-byte + * reads into a single 64k read followed by 127 cache hits; this reduces + * latency dramatically. In the worst case, it can turn an isolated 512-byte + * read into a 64k read, which doesn't affect latency all that much but is + * terribly wasteful of bandwidth. A more intelligent version of the cache + * could keep track of access patterns and not do read-ahead unless it sees + * at least two temporally close I/Os to the same region. Currently, only + * metadata I/O is inflated. A further enhancement could take advantage of + * more semantic information about the I/O. And it could use something + * faster than an AVL tree; that was chosen solely for convenience.
+ * + * There are five cache operations: allocate, fill, read, write, evict. + * + * (1) Allocate. This reserves a cache entry for the specified region. + * We separate the allocate and fill operations so that multiple threads + * don't generate I/O for the same cache miss. + * + * (2) Fill. When the I/O for a cache miss completes, the fill routine + * places the data in the previously allocated cache entry. + * + * (3) Read. Read data from the cache. + * + * (4) Write. Update cache contents after write completion. + * + * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry + * if the total cache size exceeds zfs_vdev_cache_size. + */ + +/* + * These tunables are for performance analysis. + */ +/* + * All i/os smaller than zfs_vdev_cache_max will be turned into + * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software + * track buffer). At most zfs_vdev_cache_size bytes will be kept in each + * vdev's vdev_cache. + * + * TODO: Note that with the current ZFS code, it turns out that the + * vdev cache is not helpful, and in some cases actually harmful. It + * is better if we disable this. Once some time has passed, we should + * actually remove this to simplify the code. For now we just disable + * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 + * has made these same changes. + */ +static uint_t zfs_vdev_cache_max = 1 << 14; /* 16KB */ +static uint_t zfs_vdev_cache_size = 0; +static uint_t zfs_vdev_cache_bshift = 16; + +#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ + +static kstat_t *vdc_ksp = NULL; + +typedef struct vdc_stats { + kstat_named_t vdc_stat_delegations; + kstat_named_t vdc_stat_hits; + kstat_named_t vdc_stat_misses; +} vdc_stats_t; + +static vdc_stats_t vdc_stats = { + { "delegations", KSTAT_DATA_UINT64 }, + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 } +}; + +#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64) + +static int +vdev_cache_offset_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; + const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; + + return (TREE_CMP(ve1->ve_offset, ve2->ve_offset)); +} + +static int +vdev_cache_lastused_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; + const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; + + int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); + if (likely(cmp)) + return (cmp); + + /* + * Among equally old entries, sort by offset to ensure uniqueness. + */ + return (vdev_cache_offset_compare(a1, a2)); +} + +/* + * Evict the specified entry from the cache. + */ +static void +vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) +{ + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT3P(ve->ve_fill_io, ==, NULL); + ASSERT3P(ve->ve_abd, !=, NULL); + + avl_remove(&vc->vc_lastused_tree, ve); + avl_remove(&vc->vc_offset_tree, ve); + abd_free(ve->ve_abd); + kmem_free(ve, sizeof (vdev_cache_entry_t)); +} + +/* + * Allocate an entry in the cache. At this point we don't have the data, + * we're just creating a placeholder so that multiple threads don't all + * go off and read the same blocks. + */ +static vdev_cache_entry_t * +vdev_cache_allocate(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + uint64_t offset = P2ALIGN(zio->io_offset, VCBS); + vdev_cache_entry_t *ve; + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + + if (zfs_vdev_cache_size == 0) + return (NULL); + + /* + * If adding a new entry would exceed the cache size, + * evict the oldest entry (LRU). + */ + if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > + zfs_vdev_cache_size) { + ve = avl_first(&vc->vc_lastused_tree); + if (ve->ve_fill_io != NULL) + return (NULL); + ASSERT3U(ve->ve_hits, !=, 0); + vdev_cache_evict(vc, ve); + } + + ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); + ve->ve_offset = offset; + ve->ve_lastused = ddi_get_lbolt(); + ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); + + avl_add(&vc->vc_offset_tree, ve); + avl_add(&vc->vc_lastused_tree, ve); + + return (ve); +} + +static void +vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) +{ + uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT3P(ve->ve_fill_io, ==, NULL); + + if (ve->ve_lastused != ddi_get_lbolt()) { + avl_remove(&vc->vc_lastused_tree, ve); + ve->ve_lastused = ddi_get_lbolt(); + avl_add(&vc->vc_lastused_tree, ve); + } + + ve->ve_hits++; + abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); +} + +/* + * Fill a previously allocated cache entry with data. + */ +static void +vdev_cache_fill(zio_t *fio) +{ + vdev_t *vd = fio->io_vd; + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve = fio->io_private; + zio_t *pio; + + ASSERT3U(fio->io_size, ==, VCBS); + + /* + * Add data to the cache.
+ */ + mutex_enter(&vc->vc_lock); + + ASSERT3P(ve->ve_fill_io, ==, fio); + ASSERT3U(ve->ve_offset, ==, fio->io_offset); + ASSERT3P(ve->ve_abd, ==, fio->io_abd); + + ve->ve_fill_io = NULL; + + /* + * Even if this cache line was invalidated by a missed write update, + * any reads that were queued up before the missed update are still + * valid, so we can satisfy them from this line before we evict it. + */ + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(fio, &zl)) != NULL) + vdev_cache_hit(vc, ve, pio); + + if (fio->io_error || ve->ve_missed_update) + vdev_cache_evict(vc, ve); + + mutex_exit(&vc->vc_lock); +} + +/* + * Read data from the cache. Returns B_TRUE on a cache hit, B_FALSE on a miss. + */ +boolean_t +vdev_cache_read(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); + zio_t *fio; + uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS); + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + if (zfs_vdev_cache_size == 0) + return (B_FALSE); + + if (zio->io_flags & ZIO_FLAG_DONT_CACHE) + return (B_FALSE); + + if (zio->io_size > zfs_vdev_cache_max) + return (B_FALSE); + + /* + * If the I/O straddles two or more cache blocks, don't cache it. + */ + if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) + return (B_FALSE); + + ASSERT3U(cache_phase + zio->io_size, <=, VCBS); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = cache_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); + + if (ve != NULL) { + if (ve->ve_missed_update) { + mutex_exit(&vc->vc_lock); + return (B_FALSE); + } + + if ((fio = ve->ve_fill_io) != NULL) { + zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); + mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_delegations); + return (B_TRUE); + } + + vdev_cache_hit(vc, ve, zio); + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_hits); + return (B_TRUE); + } + + ve = vdev_cache_allocate(zio); + + if (ve == NULL) { + mutex_exit(&vc->vc_lock); + return (B_FALSE); + } + + fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, + ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, + ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); + + ve->ve_fill_io = fio; + zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); + + mutex_exit(&vc->vc_lock); + zio_nowait(fio); + VDCSTAT_BUMP(vdc_stat_misses); + + return (B_TRUE); +} + +/* + * Update cache contents upon write completion.
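vdev_cache_read() above turns a small read into a VCBS-sized one with pure power-of-two arithmetic: P2ALIGN() finds the cache line, P2PHASE() the copy-out offset within it, and P2BOUNDARY() rejects reads that straddle two lines. A self-contained sketch, with the P2 macros redefined locally under the usual power-of-two-alignment assumptions:

#include <assert.h>
#include <stdint.h>

/* local stand-ins for the ZFS P2 macros (power-of-two alignments only) */
#define P2ALIGN(x, a)		((x) & -(a))
#define P2PHASE(x, a)		((x) & ((a) - 1))
#define P2BOUNDARY(off, len, a)	(((off) ^ ((off) + (len) - 1)) > (a) - 1)

int
main(void)
{
	const uint64_t vcbs = 1ULL << 16;	/* 64KB cache line (VCBS) */
	uint64_t io_offset = 3 * vcbs + 512;	/* hypothetical 512B read */
	uint64_t io_size = 512;

	/* the whole line containing the read is fetched once... */
	assert(P2ALIGN(io_offset, vcbs) == 3 * vcbs);
	/* ...and the caller's data is copied out at this phase */
	assert(P2PHASE(io_offset, vcbs) == 512);
	/* an in-line read does not straddle; one crossing the line does */
	assert(!P2BOUNDARY(io_offset, io_size, vcbs));
	assert(P2BOUNDARY(vcbs - 256, 512ULL, vcbs));
	return (0);
}

Up to vcbs / 512 == 128 back-to-back 512-byte reads land in the same line, which is the best case quoted in the file comment.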
+ */ +void +vdev_cache_write(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t io_start = zio->io_offset; + uint64_t io_end = io_start + zio->io_size; + uint64_t min_offset = P2ALIGN(io_start, VCBS); + uint64_t max_offset = P2ROUNDUP(io_end, VCBS); + avl_index_t where; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = min_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); + + if (ve == NULL) + ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); + + while (ve != NULL && ve->ve_offset < max_offset) { + uint64_t start = MAX(ve->ve_offset, io_start); + uint64_t end = MIN(ve->ve_offset + VCBS, io_end); + + if (ve->ve_fill_io != NULL) { + ve->ve_missed_update = 1; + } else { + abd_copy_off(ve->ve_abd, zio->io_abd, + start - ve->ve_offset, start - io_start, + end - start); + } + ve = AVL_NEXT(&vc->vc_offset_tree, ve); + } + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_init(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_offset_node)); + + avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_lastused_node)); +} + +void +vdev_cache_fini(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + vdev_cache_purge(vd); + + avl_destroy(&vc->vc_offset_tree); + avl_destroy(&vc->vc_lastused_tree); + + mutex_destroy(&vc->vc_lock); +} + +void +vdev_cache_stat_init(void) +{ + vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (vdc_ksp != NULL) { + vdc_ksp->ks_data = &vdc_stats; + kstat_install(vdc_ksp); + } +} + +void +vdev_cache_stat_fini(void) +{ + if (vdc_ksp != NULL) { + kstat_delete(vdc_ksp); + vdc_ksp = NULL; + } +} + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, UINT, ZMOD_RW, + "Inflate reads smaller than max"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, UINT, ZMOD_RD, + "Total size of the per-disk cache"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, UINT, ZMOD_RW, + "Shift size to inflate reads to"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -293,16 +293,17 @@ indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; - while ((is = list_remove_head(&iv->iv_splits)) != NULL) { + while ((is = list_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } + list_remove(&iv->iv_splits, is); indirect_child_t *ic; - while ((ic = list_remove_head(&is->is_unique_child)) != NULL) - ; + while ((ic = list_head(&is->is_unique_child)) != NULL) + list_remove(&is->is_unique_child, ic); list_destroy(&is->is_unique_child); @@ -1658,8 +1659,8 @@ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t
*ic; - while ((ic = list_remove_head(&is->is_unique_child)) != NULL) - ; + while ((ic = list_head(&is->is_unique_child)) != NULL) + list_remove(&is->is_unique_child, ic); is->is_unique_children = 0; } diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -486,9 +486,6 @@ if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); - if (flags & VDEV_CONFIG_L2CACHE) - fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); - if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -228,6 +228,13 @@ */ uint_t zfs_vdev_def_queue_depth = 32; +/* + * Allow TRIM I/Os to be aggregated. This should normally not be needed since + * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted + * by the TRIM code in zfs_trim.c. + */ +static uint_t zfs_vdev_aggregate_trim = 0; + static int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -242,60 +249,38 @@ return (TREE_PCMP(z1, z2)); } -#define VDQ_T_SHIFT 29 +static inline avl_tree_t * +vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) +{ + return (&vq->vq_class[p].vqc_queued_tree); +} + +static inline avl_tree_t * +vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) +{ + ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); + if (t == ZIO_TYPE_READ) + return (&vq->vq_read_offset_tree); + else if (t == ZIO_TYPE_WRITE) + return (&vq->vq_write_offset_tree); + else + return (&vq->vq_trim_offset_tree); +} static int -vdev_queue_to_compare(const void *x1, const void *x2) +vdev_queue_timestamp_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, - z2->io_timestamp >> VDQ_T_SHIFT); - int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); - int cmp = tcmp ? 
tcmp : ocmp; + int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); - if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) + if (likely(cmp)) return (cmp); return (TREE_PCMP(z1, z2)); } -static inline boolean_t -vdev_queue_class_fifo(zio_priority_t p) -{ - return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM); -} - -static void -vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) -{ - zio_priority_t p = zio->io_priority; - vq->vq_cqueued |= 1U << p; - if (vdev_queue_class_fifo(p)) - list_insert_tail(&vq->vq_class[p].vqc_list, zio); - else - avl_add(&vq->vq_class[p].vqc_tree, zio); -} - -static void -vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) -{ - zio_priority_t p = zio->io_priority; - uint32_t empty; - if (vdev_queue_class_fifo(p)) { - list_t *list = &vq->vq_class[p].vqc_list; - list_remove(list, zio); - empty = list_is_empty(list); - } else { - avl_tree_t *tree = &vq->vq_class[p].vqc_tree; - avl_remove(tree, zio); - empty = avl_is_empty(tree); - } - vq->vq_cqueued &= ~(empty << p); -} - static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { @@ -375,7 +360,7 @@ } static uint_t -vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) +vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -385,7 +370,7 @@ case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); + return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -429,10 +414,10 @@ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - uint32_t cq = vq->vq_cqueued; - zio_priority_t p, p1; + spa_t *spa = vq->vq_vdev->vdev_spa; + zio_priority_t p, n; - if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) + if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -440,18 +425,14 @@ * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. */ - p1 = vq->vq_last_prio + 1; - if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) - p1 = 0; - for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < - vdev_queue_class_min_active(vq, p)) - goto found; - } - for (p = 0; p < p1; p++) { - if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < - vdev_queue_class_min_active(vq, p)) - goto found; + for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { + p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && + vq->vq_class[p].vqc_active < + vdev_queue_class_min_active(vq, p)) { + vq->vq_last_prio = p; + return (p); + } } /* @@ -459,14 +440,16 @@ * maximum # outstanding i/os. 
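The restored vdev_queue_class_to_issue() above scans the priority classes round-robin, starting just past the class it last serviced, so one busy class cannot starve the others. A toy model of that scan (hypothetical class count and queue depths, not the ZFS API):

#include <stdio.h>

#define NUM_CLASSES 6

/* round-robin scan starting just past the last serviced class */
static int
next_class(int last, const int queued[NUM_CLASSES])
{
	for (int n = 0; n < NUM_CLASSES; n++) {
		int p = (last + n + 1) % NUM_CLASSES;

		if (queued[p] > 0)
			return (p);
	}
	return (-1);	/* nothing queued */
}

int
main(void)
{
	int queued[NUM_CLASSES] = { 1, 0, 1, 0, 0, 1 };
	int p = 5;

	for (int i = 0; i < 4; i++) {
		p = next_class(p, queued);
		printf("%d\n", p);	/* prints 0, 2, 5, 0: no starvation */
	}
	return (0);
}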
*/ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < - vdev_queue_class_max_active(vq, p)) - break; + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && + vq->vq_class[p].vqc_active < + vdev_queue_class_max_active(spa, vq, p)) { + vq->vq_last_prio = p; + return (p); + } } -found: - vq->vq_last_prio = p; - return (p); + /* No eligible queued i/os */ + return (ZIO_PRIORITY_NUM_QUEUEABLE); } void @@ -475,30 +458,42 @@ vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; + taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (vdev_queue_class_fifo(p)) { - list_create(&vq->vq_class[p].vqc_list, - sizeof (zio_t), - offsetof(struct zio, io_queue_node.l)); - } else { - avl_create(&vq->vq_class[p].vqc_tree, - vdev_queue_to_compare, sizeof (zio_t), - offsetof(struct zio, io_queue_node.a)); - } - } - avl_create(&vq->vq_read_offset_tree, + avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_queue_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); - avl_create(&vq->vq_write_offset_tree, + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + int (*compfn) (const void *, const void *); + + /* + * The synchronous/trim i/o queues are dispatched in FIFO rather + * than LBA order. This provides more consistent latency for + * these i/os. 
+		 */
+		if (p == ZIO_PRIORITY_SYNC_READ ||
+		    p == ZIO_PRIORITY_SYNC_WRITE ||
+		    p == ZIO_PRIORITY_TRIM) {
+			compfn = vdev_queue_timestamp_compare;
+		} else {
+			compfn = vdev_queue_offset_compare;
+		}
+		avl_create(vdev_queue_class_tree(vq, p), compfn,
+		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
+	}
+
	vq->vq_last_offset = 0;
-	list_create(&vq->vq_active_list, sizeof (struct zio),
-	    offsetof(struct zio, io_queue_node.l));
-	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
 void
@@ -506,39 +501,30 @@
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
-	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-		if (vdev_queue_class_fifo(p))
-			list_destroy(&vq->vq_class[p].vqc_list);
-		else
-			avl_destroy(&vq->vq_class[p].vqc_tree);
-	}
-	avl_destroy(&vq->vq_read_offset_tree);
-	avl_destroy(&vq->vq_write_offset_tree);
+	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+		avl_destroy(vdev_queue_class_tree(vq, p));
+	avl_destroy(&vq->vq_active_tree);
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
 
-	list_destroy(&vq->vq_active_list);
 	mutex_destroy(&vq->vq_lock);
 }
 
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
-	zio->io_queue_state = ZIO_QS_QUEUED;
-	vdev_queue_class_add(vq, zio);
-	if (zio->io_type == ZIO_TYPE_READ)
-		avl_add(&vq->vq_read_offset_tree, zio);
-	else if (zio->io_type == ZIO_TYPE_WRITE)
-		avl_add(&vq->vq_write_offset_tree, zio);
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 }
 
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
-	vdev_queue_class_remove(vq, zio);
-	if (zio->io_type == ZIO_TYPE_READ)
-		avl_remove(&vq->vq_read_offset_tree, zio);
-	else if (zio->io_type == ZIO_TYPE_WRITE)
-		avl_remove(&vq->vq_write_offset_tree, zio);
-	zio->io_queue_state = ZIO_QS_NONE;
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 }
 
 static boolean_t
@@ -560,16 +546,14 @@
 {
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	vq->vq_cactive[zio->io_priority]++;
-	vq->vq_active++;
+	vq->vq_class[zio->io_priority].vqc_active++;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (++vq->vq_ia_active == 1)
 			vq->vq_nia_credit = 1;
 	} else if (vq->vq_ia_active > 0) {
 		vq->vq_nia_credit--;
 	}
-	zio->io_queue_state = ZIO_QS_ACTIVE;
-	list_insert_tail(&vq->vq_active_list, zio);
+	avl_add(&vq->vq_active_tree, zio);
 }
 
 static void
@@ -577,8 +561,7 @@
 {
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	vq->vq_cactive[zio->io_priority]--;
-	vq->vq_active--;
+	vq->vq_class[zio->io_priority].vqc_active--;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (--vq->vq_ia_active == 0)
 			vq->vq_nia_credit = 0;
@@ -586,8 +569,7 @@
 			vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		vq->vq_nia_credit++;
-	list_remove(&vq->vq_active_list, zio);
-	zio->io_queue_state = ZIO_QS_NONE;
+	avl_remove(&vq->vq_active_tree, zio);
 }
 
 static void
@@ -620,28 +602,29 @@
 	uint64_t maxgap = 0;
 	uint64_t size;
 	uint64_t limit;
+	int maxblocksize;
 	boolean_t stretch = B_FALSE;
+	avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
+	zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 	uint64_t next_offset;
 	abd_t *abd;
-	avl_tree_t *t;
-
-	/*
-	 * TRIM aggregation should not be needed since code in zfs_trim.c can
-	 * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
-	 */
-	if (zio->io_type == ZIO_TYPE_TRIM)
-		return (NULL);
-
-	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
-		return (NULL);
+	maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
 
 	if (vq->vq_vdev->vdev_nonrot)
 		limit = zfs_vdev_aggregation_limit_non_rotating;
 	else
 		limit = zfs_vdev_aggregation_limit;
-	if (limit == 0)
+	limit = MIN(limit, maxblocksize);
+
+	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
+		return (NULL);
+
+	/*
+	 * While TRIM commands could be aggregated based on offset, this
+	 * behavior is disabled until it's determined to be beneficial.
+	 */
+	if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
 		return (NULL);
-	limit = MIN(limit, SPA_MAXBLOCKSIZE);
 
 	/*
 	 * I/Os to distributed spares are directly dispatched to the dRAID
@@ -652,13 +635,8 @@
 	first = last = zio;
 
-	if (zio->io_type == ZIO_TYPE_READ) {
+	if (zio->io_type == ZIO_TYPE_READ)
 		maxgap = zfs_vdev_read_gap_limit;
-		t = &vq->vq_read_offset_tree;
-	} else {
-		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
-		t = &vq->vq_write_offset_tree;
-	}
 
 	/*
 	 * We can aggregate I/Os that are sufficiently adjacent and of
@@ -679,7 +657,6 @@
 	 * Walk backwards through sufficiently contiguous I/Os
 	 * recording the last non-optional I/O.
 	 */
-	zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 	while ((dio = AVL_PREV(t, first)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    IO_SPAN(dio, last) <= limit &&
@@ -709,7 +686,7 @@
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    (IO_SPAN(first, dio) <= limit ||
 	    (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
-	    IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
+	    IO_SPAN(first, dio) <= maxblocksize &&
 	    IO_GAP(last, dio) <= maxgap &&
 	    dio->io_type == zio->io_type) {
 		last = dio;
@@ -763,7 +740,7 @@
 		return (NULL);
 
 	size = IO_SPAN(first, last);
-	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(size, <=, maxblocksize);
 
 	abd = abd_alloc_gang();
 	if (abd == NULL)
@@ -771,7 +748,8 @@
 	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
 	    abd, size, first->io_type, zio->io_priority,
-	    flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL);
+	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+	    vdev_queue_agg_io_done, NULL);
 	aio->io_timestamp = first->io_timestamp;
 
 	nio = first;
@@ -847,30 +825,19 @@
 		return (NULL);
 	}
 
-	if (vdev_queue_class_fifo(p)) {
-		zio = list_head(&vq->vq_class[p].vqc_list);
-	} else {
-		/*
-		 * For LBA-ordered queues (async / scrub / initializing),
-		 * issue the I/O which follows the most recently issued I/O
-		 * in LBA (offset) order, but to avoid starvation only within
-		 * the same 0.5 second interval as the first I/O.
-		 */
-		tree = &vq->vq_class[p].vqc_tree;
-		zio = aio = avl_first(tree);
-		if (zio->io_offset < vq->vq_last_offset) {
-			vq->vq_io_search.io_timestamp = zio->io_timestamp;
-			vq->vq_io_search.io_offset = vq->vq_last_offset;
-			zio = avl_find(tree, &vq->vq_io_search, &idx);
-			if (zio == NULL) {
-				zio = avl_nearest(tree, idx, AVL_AFTER);
-				if (zio == NULL ||
-				    (zio->io_timestamp >> VDQ_T_SHIFT) !=
-				    (aio->io_timestamp >> VDQ_T_SHIFT))
-					zio = aio;
-			}
-		}
-	}
+	/*
+	 * For LBA-ordered queues (async / scrub / initializing), issue the
+	 * i/o which follows the most recently issued i/o in LBA (offset) order.
+	 *
+	 * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
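+	 * The search key built below (timestamp 0, vq_last_offset - 1)
+	 * cannot match a real i/o, so avl_find() only positions the cursor;
+	 * avl_nearest(AVL_AFTER) then returns the first queued i/o at or
+	 * beyond the last issued offset, wrapping to avl_first() at the end.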
+	 */
+	tree = vdev_queue_class_tree(vq, p);
+	vq->vq_io_search.io_timestamp = 0;
+	vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
+	VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
+	zio = avl_nearest(tree, idx, AVL_AFTER);
+	if (zio == NULL)
+		zio = avl_first(tree);
 
 	ASSERT3U(zio->io_priority, ==, p);
 
 	aio = vdev_queue_aggregate(vq, zio);
@@ -940,7 +907,7 @@
 		ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
 	}
 
-	zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
+	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 	zio->io_timestamp = gethrtime();
 
 	mutex_enter(&vq->vq_lock);
@@ -1001,6 +968,7 @@
 vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	avl_tree_t *tree;
 
 	/*
 	 * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@@ -1035,11 +1003,12 @@
 	 * Otherwise, the zio is currently active and we cannot change its
 	 * priority.
 	 */
-	if (zio->io_queue_state == ZIO_QS_QUEUED) {
-		vdev_queue_class_remove(vq, zio);
+	tree = vdev_queue_class_tree(vq, zio->io_priority);
+	if (avl_find(tree, zio, NULL) == zio) {
+		avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
 		zio->io_priority = priority;
-		vdev_queue_class_add(vq, zio);
-	} else if (zio->io_queue_state == ZIO_QS_NONE) {
+		avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
 		zio->io_priority = priority;
 	}
 
@@ -1052,10 +1021,10 @@
 * vq_lock mutex use here, instead we prefer to keep it lock free for
 * performance.
 */
-uint32_t
+int
 vdev_queue_length(vdev_t *vd)
 {
-	return (vd->vdev_queue.vq_active);
+	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
 }
 
 uint64_t
@@ -1064,22 +1033,15 @@
 	return (vd->vdev_queue.vq_last_offset);
 }
 
-uint64_t
-vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
-{
-	vdev_queue_t *vq = &vd->vdev_queue;
-	if (vdev_queue_class_fifo(p))
-		return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
-	else
-		return (avl_numnodes(&vq->vq_class[p].vqc_tree));
-}
-
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
	"Max vdev I/O aggregation size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
	ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
 
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW,
+	"Allow TRIM I/O to be aggregated");
+
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
	"Aggregate read I/O over gap");
 
diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
--- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
@@ -571,10 +571,8 @@
 	vdev_rebuild_blkptr_init(&blk, vd, start, size);
 	uint64_t psize = BP_GET_PSIZE(&blk);
 
-	if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) {
-		vr->vr_pass_bytes_skipped += size;
+	if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
 		return (0);
-	}
 
 	mutex_enter(&vr->vr_io_lock);
 
@@ -788,7 +786,6 @@
 	vr->vr_pass_start_time = gethrtime();
 	vr->vr_pass_bytes_scanned = 0;
 	vr->vr_pass_bytes_issued = 0;
-	vr->vr_pass_bytes_skipped = 0;
 
 	uint64_t update_est_time = gethrtime();
 	vdev_rebuild_update_bytes_est(vd, 0);
@@ -1156,7 +1153,6 @@
 		    vr->vr_pass_start_time);
 		vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
 		vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
-		vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped;
 		mutex_exit(&tvd->vdev_rebuild_lock);
 	}
 
diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c
--- a/sys/contrib/openzfs/module/zfs/zap_micro.c
+++ b/sys/contrib/openzfs/module/zfs/zap_micro.c
@@ -285,7 +285,6 @@
 	}
 }
 
-__attribute__((always_inline)) inline
 static int
 mze_compare(const void *arg1, const void *arg2)
 {
@@ -296,9 +295,6 @@
 	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
 }
 
-ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
-    mze_compare)
-
 static void
 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
 {
@@ -465,7 +461,7 @@
 		 * 62 entries before we have to add 2KB B-tree core node.
 		 */
 		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
-		    mze_find_in_buf, sizeof (mzap_ent_t), 512);
+		    sizeof (mzap_ent_t), 512);
 
 		zap_name_t *zn = zap_name_alloc(zap);
 		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c
--- a/sys/contrib/openzfs/module/zfs/zfs_fm.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@@ -1522,8 +1522,9 @@
 {
 	recent_events_node_t *entry;
 
-	while ((entry = list_remove_head(&recent_events_list)) != NULL) {
+	while ((entry = list_head(&recent_events_list)) != NULL) {
 		avl_remove(&recent_events_tree, entry);
+		list_remove(&recent_events_list, entry);
 		kmem_free(entry, sizeof (*entry));
 	}
 	avl_destroy(&recent_events_tree);
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
--- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
@@ -699,15 +699,19 @@
 	zfs_fuid_t *zfuid;
 	zfs_fuid_domain_t *zdomain;
 
-	while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL)
+	while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
+		list_remove(&fuidp->z_fuids, zfuid);
 		kmem_free(zfuid, sizeof (zfs_fuid_t));
+	}
 
 	if (fuidp->z_domain_table != NULL)
 		kmem_free(fuidp->z_domain_table,
 		    (sizeof (char *)) * fuidp->z_domain_cnt);
 
-	while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL)
+	while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
+		list_remove(&fuidp->z_domains, zdomain);
 		kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
+	}
 
 	kmem_free(fuidp, sizeof (zfs_fuid_info_t));
 }
diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
--- a/sys/contrib/openzfs/module/zfs/zfs_onexit.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
@@ -87,7 +87,8 @@
 	zfs_onexit_action_node_t *ap;
 
 	mutex_enter(&zo->zo_lock);
-	while ((ap = list_remove_head(&zo->zo_actions)) != NULL) {
+	while ((ap = list_head(&zo->zo_actions)) != NULL) {
+		list_remove(&zo->zo_actions, ap);
 		mutex_exit(&zo->zo_lock);
 		ap->za_func(ap->za_data);
 		kmem_free(ap, sizeof (zfs_onexit_action_node_t));
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
--- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -462,12 +462,14 @@
 		return (SET_ERROR(EINVAL));
 	}
 
+	const uint64_t max_blksz = zfsvfs->z_max_blksz;
+
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
+	 * Skip this if uio contains loaned arc_buf.
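+	 * Faulting the user pages in up front means a later page fault
+	 * (e.g. against a slow NFS mapping) cannot stall the txg while a
+	 * transaction is held open below.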
 	 */
-	ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
-	if (zfs_uio_prefaultpages(pfbytes, uio)) {
+	if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EFAULT));
 	}
@@ -542,31 +544,10 @@
 			break;
 		}
 
-		uint64_t blksz;
-		if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
-			if (zp->z_blksz > zfsvfs->z_max_blksz &&
-			    !ISP2(zp->z_blksz)) {
-				/*
-				 * File's blocksize is already larger than the
-				 * "recordsize" property. Only let it grow to
-				 * the next power of 2.
-				 */
-				blksz = 1 << highbit64(zp->z_blksz);
-			} else {
-				blksz = zfsvfs->z_max_blksz;
-			}
-			blksz = MIN(blksz, P2ROUNDUP(end_size,
-			    SPA_MINBLOCKSIZE));
-			blksz = MAX(blksz, zp->z_blksz);
-		} else {
-			blksz = zp->z_blksz;
-		}
-
 		arc_buf_t *abuf = NULL;
-		ssize_t nbytes = n;
-		if (n >= blksz && woff >= zp->z_size &&
-		    P2PHASE(woff, blksz) == 0 &&
-		    (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
+		if (n >= max_blksz && woff >= zp->z_size &&
+		    P2PHASE(woff, max_blksz) == 0 &&
+		    zp->z_blksz == max_blksz) {
 			/*
 			 * This write covers a full block. "Borrow" a buffer
 			 * from the dmu so that we can fill it before we enter
@@ -574,26 +555,18 @@
 			 * holding up the transaction if the data copy hangs
 			 * up on a pagefault (e.g., from an NFS server mapping).
 			 */
+			size_t cbytes;
+
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
-			    blksz);
+			    max_blksz);
 			ASSERT(abuf != NULL);
-			ASSERT(arc_buf_size(abuf) == blksz);
-			if ((error = zfs_uiocopy(abuf->b_data, blksz,
-			    UIO_WRITE, uio, &nbytes))) {
+			ASSERT(arc_buf_size(abuf) == max_blksz);
+			if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
+			    UIO_WRITE, uio, &cbytes))) {
 				dmu_return_arcbuf(abuf);
 				break;
 			}
-			ASSERT3S(nbytes, ==, blksz);
-		} else {
-			nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
-			    P2PHASE(woff, blksz));
-			if (pfbytes < nbytes) {
-				if (zfs_uio_prefaultpages(nbytes, uio)) {
-					error = SET_ERROR(EFAULT);
-					break;
-				}
-				pfbytes = nbytes;
-			}
+			ASSERT3S(cbytes, ==, max_blksz);
 		}
 
 		/*
@@ -603,7 +576,8 @@
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
 		DB_DNODE_ENTER(db);
-		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
+		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+		    MIN(n, max_blksz));
 		DB_DNODE_EXIT(db);
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
@@ -626,10 +600,31 @@
 		 * shrink down lr_length to the appropriate size.
 		 */
 		if (lr->lr_length == UINT64_MAX) {
-			zfs_grow_blocksize(zp, blksz, tx);
+			uint64_t new_blksz;
+
+			if (zp->z_blksz > max_blksz) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property. Only let it grow to
+				 * the next power of 2.
+				 */
+				ASSERT(!ISP2(zp->z_blksz));
+				new_blksz = MIN(end_size,
+				    1 << highbit64(zp->z_blksz));
+			} else {
+				new_blksz = MIN(end_size, max_blksz);
+			}
+			zfs_grow_blocksize(zp, new_blksz, tx);
 			zfs_rangelock_reduce(lr, woff, n);
 		}
 
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		const ssize_t nbytes =
+		    MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
 		ssize_t tx_bytes;
 		if (abuf == NULL) {
 			tx_bytes = zfs_uio_resid(uio);
@@ -649,8 +644,12 @@
 			 * zfs_uio_prefaultpages, or prefaultpages may
 			 * error, and we may break the loop early.
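+			 * A short uiomove() means some pages faulted after
+			 * the earlier prefault; account for what was copied
+			 * and prefault again before retrying.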
 			 */
-			n -= tx_bytes - zfs_uio_resid(uio);
-			pfbytes -= tx_bytes - zfs_uio_resid(uio);
+			if (tx_bytes != zfs_uio_resid(uio))
+				n -= tx_bytes - zfs_uio_resid(uio);
+			if (zfs_uio_prefaultpages(MIN(n, max_blksz),
+			    uio)) {
+				break;
+			}
 			continue;
 		}
 #endif
@@ -666,6 +665,15 @@
 			}
 			tx_bytes -= zfs_uio_resid(uio);
 		} else {
+			/* Implied by abuf != NULL: */
+			ASSERT3S(n, >=, max_blksz);
+			ASSERT0(P2PHASE(woff, max_blksz));
+			/*
+			 * We can simplify nbytes to MIN(n, max_blksz) since
+			 * P2PHASE(woff, max_blksz) is 0, and knowing
+			 * n >= max_blksz lets us simplify further:
+			 */
+			ASSERT3S(nbytes, ==, max_blksz);
 			/*
 			 * Thus, we're writing a full block at a block-aligned
 			 * offset and extending the file past EOF.
@@ -750,7 +758,13 @@
 			break;
 		ASSERT3S(tx_bytes, ==, nbytes);
 		n -= nbytes;
-		pfbytes -= nbytes;
+
+		if (n > 0) {
+			if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
+				error = SET_ERROR(EFAULT);
+				break;
+			}
+		}
 	}
 
 	zfs_znode_update_vfs(zp);
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -116,12 +116,8 @@
 	{ "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
-	{ "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
-	{ "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
-	{ "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
-	{ "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 },
 };
 
 static zil_sums_t zil_sums_global;
@@ -150,10 +146,6 @@
 static kmem_cache_t *zil_lwb_cache;
 static kmem_cache_t *zil_zcw_cache;
 
-static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
-static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb);
-static itx_t *zil_itx_clone(itx_t *oitx);
-
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
@@ -249,10 +241,11 @@
  */
 static int
 zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
-    blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
+    blkptr_t *nbp, void *dst, char **end)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
+	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
@@ -269,7 +262,7 @@
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
-	    abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+	    &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
@@ -284,23 +277,23 @@
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
-		uint64_t size = BP_GET_LSIZE(bp);
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
-			zil_chain_t *zilc = (*abuf)->b_data;
+			zil_chain_t *zilc = abuf->b_data;
 			char *lr = (char *)(zilc + 1);
+			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
-			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
-			    zilc->zc_nused < sizeof (*zilc) ||
-			    zilc->zc_nused > size) {
+			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
 				error = SET_ERROR(ECKSUM);
 			} else {
-				*begin = lr;
-				*end = lr + zilc->zc_nused - sizeof (*zilc);
+				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
+				memcpy(dst, lr, len);
+				*end = (char *)dst + len;
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
-			char *lr = (*abuf)->b_data;
+			char *lr = abuf->b_data;
+			uint64_t size = BP_GET_LSIZE(bp);
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
@@ -308,11 +301,15 @@
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
-				*begin = lr;
-				*end = lr + zilc->zc_nused;
+				ASSERT3U(zilc->zc_nused, <=,
+				    SPA_OLD_MAXBLOCKSIZE);
+				memcpy(dst, lr, zilc->zc_nused);
+				*end = (char *)dst + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
+
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
@@ -378,12 +375,8 @@
 	wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
-	wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
-	wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
-	wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
-	wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
 }
 
 void
@@ -400,12 +393,8 @@
 	wmsum_fini(&zs->zil_itx_needcopy_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_count);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
-	wmsum_fini(&zs->zil_itx_metaslab_normal_write);
-	wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_count);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
-	wmsum_fini(&zs->zil_itx_metaslab_slog_write);
-	wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
 }
 
 void
@@ -433,18 +422,10 @@
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
 	zs->zil_itx_metaslab_normal_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
-	zs->zil_itx_metaslab_normal_write.value.ui64 =
-	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
-	zs->zil_itx_metaslab_normal_alloc.value.ui64 =
-	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
 	zs->zil_itx_metaslab_slog_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
 	zs->zil_itx_metaslab_slog_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
-	zs->zil_itx_metaslab_slog_write.value.ui64 =
-	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
-	zs->zil_itx_metaslab_slog_alloc.value.ui64 =
-	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
 }
 
 /*
@@ -464,6 +445,7 @@
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk = {{{{0}}}};
+	char *lrbuf, *lrp;
 	int error = 0;
 
 	/*
@@ -481,13 +463,13 @@
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
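+	 * Each block is read into lrbuf and its records are walked in
+	 * order; the zc_next_blk pointer embedded in each block links
+	 * the chain to the block that follows.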
 	 */
+	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		int reclen;
-		char *lrp, *end;
-		arc_buf_t *abuf = NULL;
+		char *end = NULL;
 
 		if (blk_seq > claim_blk_seq)
 			break;
@@ -503,10 +485,8 @@
 			break;
 
 		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
-		    &lrp, &end, &abuf);
+		    lrbuf, &end);
 		if (error != 0) {
-			if (abuf)
-				arc_buf_destroy(abuf, &abuf);
 			if (claimed) {
 				char name[ZFS_MAX_DATASET_NAME_LEN];
 
@@ -519,25 +499,20 @@
 			break;
 		}
 
-		for (; lrp < end; lrp += reclen) {
+		for (lrp = lrbuf; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			reclen = lr->lrc_reclen;
 			ASSERT3U(reclen, >=, sizeof (lr_t));
-			if (lr->lrc_seq > claim_lr_seq) {
-				arc_buf_destroy(abuf, &abuf);
+			if (lr->lrc_seq > claim_lr_seq)
 				goto done;
-			}
 
 			error = parse_lr_func(zilog, lr, arg, txg);
-			if (error != 0) {
-				arc_buf_destroy(abuf, &abuf);
+			if (error != 0)
 				goto done;
-			}
 			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
-		arc_buf_destroy(abuf, &abuf);
 	}
 done:
 	zilog->zl_parse_error = error;
@@ -547,6 +522,7 @@
 	zilog->zl_parse_lr_count = lr_count;
 
 	zil_bp_tree_fini(zilog);
+	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
 	return (error);
 }
 
@@ -771,21 +747,20 @@
 	lwb->lwb_blk = *bp;
 	lwb->lwb_fastwrite = fastwrite;
 	lwb->lwb_slog = slog;
-	lwb->lwb_indirect = B_FALSE;
-	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
-		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
-		lwb->lwb_sz = BP_GET_LSIZE(bp);
-	} else {
-		lwb->lwb_nused = lwb->lwb_nfilled = 0;
-		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
-	}
 	lwb->lwb_state = LWB_STATE_CLOSED;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+	lwb->lwb_max_txg = txg;
 	lwb->lwb_write_zio = NULL;
 	lwb->lwb_root_zio = NULL;
 	lwb->lwb_issued_timestamp = 0;
 	lwb->lwb_issued_txg = 0;
-	lwb->lwb_max_txg = txg;
+	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+		lwb->lwb_nused = sizeof (zil_chain_t);
+		lwb->lwb_sz = BP_GET_LSIZE(bp);
+	} else {
+		lwb->lwb_nused = 0;
+		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
+	}
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
@@ -799,8 +774,8 @@
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
-	VERIFY(list_is_empty(&lwb->lwb_waiters));
-	VERIFY(list_is_empty(&lwb->lwb_itxs));
+	ASSERT(list_is_empty(&lwb->lwb_waiters));
+	ASSERT(list_is_empty(&lwb->lwb_itxs));
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
@@ -1398,14 +1373,9 @@
 	zil_commit_waiter_t *zcw;
 	itx_t *itx;
 	uint64_t txg;
-	list_t itxs, waiters;
 
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
-	list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
-	list_create(&waiters, sizeof (zil_commit_waiter_t),
-	    offsetof(zil_commit_waiter_t, zcw_node));
-
 	hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
 
 	mutex_enter(&zilog->zl_lock);
@@ -1414,6 +1384,9 @@
 
 	lwb->lwb_root_zio = NULL;
 
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
+
 	if (zilog->zl_last_lwb_opened == lwb) {
 		/*
 		 * Remember the highest committed log sequence number
@@ -1424,22 +1397,13 @@
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 	}
 
-	list_move_tail(&itxs, &lwb->lwb_itxs);
-	list_move_tail(&waiters, &lwb->lwb_waiters);
-	txg = lwb->lwb_issued_txg;
-
-	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
-	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
-
-	mutex_exit(&zilog->zl_lock);
-
-	while ((itx = list_remove_head(&itxs)) != NULL)
+	while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 		zil_itx_destroy(itx);
-	list_destroy(&itxs);
 
-	while ((zcw = list_remove_head(&waiters)) != NULL) {
+	while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
 		mutex_enter(&zcw->zcw_lock);
 
+		ASSERT3P(zcw->zcw_lwb, ==, lwb);
 		zcw->zcw_lwb = NULL;
 		/*
 		 * We expect any ZIO errors from child ZIOs to have been
@@ -1464,9 +1428,11 @@
 		mutex_exit(&zcw->zcw_lock);
 	}
-	list_destroy(&waiters);
+
+	mutex_exit(&zilog->zl_lock);
 
 	mutex_enter(&zilog->zl_lwb_io_lock);
+	txg = lwb->lwb_issued_txg;
 	ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
 	zilog->zl_lwb_inflight[txg & TXG_MASK]--;
 	if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
@@ -1700,41 +1666,46 @@
 	EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
 	EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
 
-	if (lwb->lwb_root_zio != NULL)
-		return;
-
-	lwb->lwb_root_zio = zio_root(zilog->zl_spa,
-	    zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
-
-	abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
-	    BP_GET_LSIZE(&lwb->lwb_blk));
-
-	if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
-		prio = ZIO_PRIORITY_SYNC_WRITE;
-	else
-		prio = ZIO_PRIORITY_ASYNC_WRITE;
-
 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 	    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
 	mutex_enter(&zilog->zl_lock);
-	if (!lwb->lwb_fastwrite) {
-		metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
-		lwb->lwb_fastwrite = 1;
-	}
+	if (lwb->lwb_root_zio == NULL) {
+		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
+		    BP_GET_LSIZE(&lwb->lwb_blk));
+
+		if (!lwb->lwb_fastwrite) {
+			metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
+			lwb->lwb_fastwrite = 1;
+		}
+
+		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+			prio = ZIO_PRIORITY_SYNC_WRITE;
+		else
+			prio = ZIO_PRIORITY_ASYNC_WRITE;
 
-	lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0,
-	    &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
-	    zil_lwb_write_done, lwb, prio,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
+		lwb->lwb_root_zio = zio_root(zilog->zl_spa,
+		    zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
+		ASSERT3P(lwb->lwb_root_zio, !=, NULL);
 
-	lwb->lwb_state = LWB_STATE_OPENED;
+		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
+		    zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
+		    BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
+		    prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
+		ASSERT3P(lwb->lwb_write_zio, !=, NULL);
 
-	zil_lwb_set_zio_dependency(zilog, lwb);
-	zilog->zl_last_lwb_opened = lwb;
+		lwb->lwb_state = LWB_STATE_OPENED;
+
+		zil_lwb_set_zio_dependency(zilog, lwb);
+		zilog->zl_last_lwb_opened = lwb;
+	}
 	mutex_exit(&zilog->zl_lock);
+
+	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 }
 
 /*
@@ -1765,11 +1736,11 @@
 static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
 
 /*
- * Close the log block for being issued and allocate the next one.
- * Has to be called under zl_issuer_lock to chain more lwbs.
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
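+ * On allocation failure NULL is returned, which forces the caller to
+ * txg_wait_synced() rather than chain another log block.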
 */
static lwb_t *
-zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, list_t *ilwbs)
+zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 {
 	lwb_t *nlwb = NULL;
 	zil_chain_t *zilc;
 	spa_t *spa = zilog->zl_spa;
 	blkptr_t *bp;
 	dmu_tx_t *tx;
 	uint64_t txg;
-	uint64_t zil_blksz;
+	uint64_t zil_blksz, wsz;
 	int i, error;
 	boolean_t slog;
 
@@ -1786,17 +1757,16 @@
 	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 
-	/*
-	 * If this lwb includes indirect writes, we have to commit before
-	 * creating the transaction, otherwise we may end up in dead lock.
-	 */
-	if (lwb->lwb_indirect) {
-		for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
-		    itx = list_next(&lwb->lwb_itxs, itx))
-			zil_lwb_commit(zilog, lwb, itx);
-		lwb->lwb_nused = lwb->lwb_nfilled;
+	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+		zilc = (zil_chain_t *)lwb->lwb_buf;
+		bp = &zilc->zc_next_blk;
+	} else {
+		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+		bp = &zilc->zc_next_blk;
 	}
 
+	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
+
 	/*
 	 * Allocate the next block and save its address in this block
 	 * before writing it in order to establish the log chain.
@@ -1844,18 +1814,19 @@
 	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
 	for (i = 0; i < ZIL_PREV_BLKS; i++)
 		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
-	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
-	    uint64_t, zil_blksz,
-	    uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
-	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2)
-		zilc = (zil_chain_t *)lwb->lwb_buf;
-	else
-		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
-	bp = &zilc->zc_next_blk;
 	BP_ZERO(bp);
 	error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
+	if (slog) {
+		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
+		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
+		    lwb->lwb_nused);
+	} else {
+		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
+		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
+		    lwb->lwb_nused);
+	}
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -1867,68 +1838,17 @@
 		nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
 	}
 
-	lwb->lwb_state = LWB_STATE_ISSUED;
-
-	dmu_tx_commit(tx);
-
-	/*
-	 * We need to acquire the config lock for the lwb to issue it later.
-	 * However, if we already have a queue of closed parent lwbs already
-	 * holding the config lock (but not yet issued), we can't block here
-	 * waiting on the lock or we will deadlock. In that case we must
-	 * first issue to parent IOs before waiting on the lock.
-	 */
-	if (ilwbs && !list_is_empty(ilwbs)) {
-		if (!spa_config_tryenter(spa, SCL_STATE, lwb, RW_READER)) {
-			lwb_t *tlwb;
-			while ((tlwb = list_remove_head(ilwbs)) != NULL)
-				zil_lwb_write_issue(zilog, tlwb);
-			spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
-		}
-	} else {
-		spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
-	}
-
-	if (ilwbs)
-		list_insert_tail(ilwbs, lwb);
-
-	/*
-	 * If there was an allocation failure then nlwb will be null which
-	 * forces a txg_wait_synced().
-	 */
-	return (nlwb);
-}
-
-/*
- * Finalize previously closed block and issue the write zio.
- * Does not require locking.
- */
-static void
-zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
-{
-	zil_chain_t *zilc;
-	int wsz;
-
-	/* Actually fill the lwb with the data if not yet. */
-	if (!lwb->lwb_indirect) {
-		for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
-		    itx = list_next(&lwb->lwb_itxs, itx))
-			zil_lwb_commit(zilog, lwb, itx);
-		lwb->lwb_nused = lwb->lwb_nfilled;
-	}
-
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		/* For Slim ZIL only write what is used. */
-		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int);
-		ASSERT3S(wsz, <=, lwb->lwb_sz);
+		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+		ASSERT3U(wsz, <=, lwb->lwb_sz);
 		zio_shrink(lwb->lwb_write_zio, wsz);
 		wsz = lwb->lwb_write_zio->io_size;
-		zilc = (zil_chain_t *)lwb->lwb_buf;
 	} else {
 		wsz = lwb->lwb_sz;
-		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
 	}
 
+	zilc->zc_pad = 0;
 	zilc->zc_nused = lwb->lwb_nused;
 	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 
@@ -1938,28 +1858,22 @@
 	 */
 	memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
 
-	if (lwb->lwb_slog) {
-		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
-		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
-		    lwb->lwb_nused);
-		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
-		    wsz);
-		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
-		    BP_GET_LSIZE(&lwb->lwb_blk));
-	} else {
-		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
-		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
-		    lwb->lwb_nused);
-		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
-		    wsz);
-		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
-		    BP_GET_LSIZE(&lwb->lwb_blk));
-	}
-	ASSERT(spa_config_held(zilog->zl_spa, SCL_STATE, RW_READER));
+	spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
+
 	zil_lwb_add_block(lwb, &lwb->lwb_blk);
 	lwb->lwb_issued_timestamp = gethrtime();
+	lwb->lwb_state = LWB_STATE_ISSUED;
+
 	zio_nowait(lwb->lwb_root_zio);
 	zio_nowait(lwb->lwb_write_zio);
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * If there was an allocation failure then nlwb will be null which
+	 * forces a txg_wait_synced().
+	 */
+	return (nlwb);
 }
 
 /*
@@ -1995,19 +1909,13 @@
 	    sizeof (lr_write_t));
 }
 
-/*
- * Estimate space needed in the lwb for the itx.  Allocate more lwbs or
- * split the itx as needed, but don't touch the actual transaction data.
- * Has to be called under zl_issuer_lock to call zil_lwb_write_close()
- * to chain more lwbs.
- */
static lwb_t *
-zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
-	itx_t *citx;
-	lr_t *lr, *clr;
-	lr_write_t *lrw;
-	uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
+	lr_t *lrcb, *lrc;
+	lr_write_t *lrwb, *lrw;
+	char *lr_buf;
+	uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
@@ -2015,8 +1923,8 @@
 
 	zil_lwb_write_open(zilog, lwb);
 
-	lr = &itx->itx_lr;
-	lrw = (lr_write_t *)lr;
+	lrc = &itx->itx_lr;
+	lrw = (lr_write_t *)lrc;
 
 	/*
 	 * A commit itx doesn't represent any on-disk state; instead
 	 *
 	 * For more details, see the comment above zil_commit().
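+	 * The commit waiter carried in itx_private is linked onto this
+	 * lwb so it can be signalled once the block reaches disk.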
 	 */
-	if (lr->lrc_txtype == TX_COMMIT) {
+	if (lrc->lrc_txtype == TX_COMMIT) {
 		mutex_enter(&zilog->zl_lock);
 		zil_commit_waiter_link_lwb(itx->itx_private, lwb);
 		itx->itx_private = NULL;
 		mutex_exit(&zilog->zl_lock);
-		list_insert_tail(&lwb->lwb_itxs, itx);
 		return (lwb);
 	}
 
-	if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
+	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
+		dpad = dlen - lrw->lr_length;
 	} else {
-		dlen = 0;
+		dlen = dpad = 0;
 	}
-	reclen = lr->lrc_reclen;
+	reclen = lrc->lrc_reclen;
 	zilog->zl_cur_used += (reclen + dlen);
+	txg = lrc->lrc_txg;
 
cont:
 	/*
@@ -2059,7 +1968,7 @@
 	    lwb_sp < zil_max_waste_space(zilog) &&
 	    (dlen % max_log_data == 0 ||
 	    lwb_sp < reclen + dlen % max_log_data))) {
-		lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
+		lwb = zil_lwb_write_issue(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_open(zilog, lwb);
@@ -2078,99 +1987,19 @@
 	}
 
 	dnow = MIN(dlen, lwb_sp - reclen);
-	if (dlen > dnow) {
-		ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
-		ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
-		citx = zil_itx_clone(itx);
-		clr = &citx->itx_lr;
-		lr_write_t *clrw = (lr_write_t *)clr;
-		clrw->lr_length = dnow;
-		lrw->lr_offset += dnow;
-		lrw->lr_length -= dnow;
-	} else {
-		citx = itx;
-		clr = lr;
-	}
-
-	/*
-	 * We're actually making an entry, so update lrc_seq to be the
-	 * log record sequence number.  Note that this is generally not
-	 * equal to the itx sequence number because not all transactions
-	 * are synchronous, and sometimes spa_sync() gets there first.
-	 */
-	clr->lrc_seq = ++zilog->zl_lr_seq;
-
-	lwb->lwb_nused += reclen + dnow;
-	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
-	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
-
-	zil_lwb_add_txg(lwb, lr->lrc_txg);
-	list_insert_tail(&lwb->lwb_itxs, citx);
-
-	dlen -= dnow;
-	if (dlen > 0) {
-		zilog->zl_cur_used += reclen;
-		goto cont;
-	}
-
-	/*
-	 * We have to really issue all queued LWBs before we may have to
-	 * wait for a txg sync.  Otherwise we may end up in a dead lock.
-	 */
-	if (lr->lrc_txtype == TX_WRITE) {
-		boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa);
-		if (frozen || itx->itx_wr_state == WR_INDIRECT) {
-			lwb_t *tlwb;
-			while ((tlwb = list_remove_head(ilwbs)) != NULL)
-				zil_lwb_write_issue(zilog, tlwb);
-		}
-		if (itx->itx_wr_state == WR_INDIRECT)
-			lwb->lwb_indirect = B_TRUE;
-		if (frozen)
-			txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
-	}
-
-	return (lwb);
-}
-
-/*
- * Fill the actual transaction data into the lwb, following zil_lwb_assign().
- * Does not require locking.
- */
-static void
-zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
-{
-	lr_t *lr, *lrb;
-	lr_write_t *lrw, *lrwb;
-	char *lr_buf;
-	uint64_t dlen, reclen;
-
-	lr = &itx->itx_lr;
-	lrw = (lr_write_t *)lr;
-
-	if (lr->lrc_txtype == TX_COMMIT)
-		return;
-
-	if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
-		dlen = P2ROUNDUP_TYPED(
-		    lrw->lr_length, sizeof (uint64_t), uint64_t);
-	} else {
-		dlen = 0;
-	}
-	reclen = lr->lrc_reclen;
-	ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
-
-	lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
-	memcpy(lr_buf, lr, reclen);
-	lrb = (lr_t *)lr_buf;		/* Like lr, but inside lwb. */
-	lrwb = (lr_write_t *)lrb;	/* Like lrw, but inside lwb. */
+	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+	memcpy(lr_buf, lrc, reclen);
+	lrcb = (lr_t *)lr_buf;		/* Like lrc, but inside lwb. */
+	lrwb = (lr_write_t *)lrcb;	/* Like lrw, but inside lwb. */
 
 	ZIL_STAT_BUMP(zilog, zil_itx_count);
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
 	 */
-	if (lr->lrc_txtype == TX_WRITE) {
+	if (lrc->lrc_txtype == TX_WRITE) {
+		if (txg > spa_freeze_txg(zilog->zl_spa))
+			txg_wait_synced(zilog->zl_dmu_pool, txg);
 		if (itx->itx_wr_state == WR_COPIED) {
 			ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
 			ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
@@ -2181,10 +2010,14 @@
 
 			if (itx->itx_wr_state == WR_NEED_COPY) {
 				dbuf = lr_buf + reclen;
-				lrb->lrc_reclen += dlen;
+				lrcb->lrc_reclen += dnow;
+				if (lrwb->lr_length > dnow)
+					lrwb->lr_length = dnow;
+				lrw->lr_offset += dnow;
+				lrw->lr_length -= dnow;
 				ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
 				ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
-				    dlen);
+				    dnow);
 			} else {
 				ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
 				dbuf = NULL;
@@ -2211,11 +2044,9 @@
 			error = zilog->zl_get_data(itx->itx_private,
 			    itx->itx_gen, lrwb, dbuf, lwb,
 			    lwb->lwb_write_zio);
-			if (dbuf != NULL && error == 0) {
+			if (dbuf != NULL && error == 0 && dnow == dlen)
 				/* Zero any padding bytes in the last block. */
-				memset((char *)dbuf + lrwb->lr_length, 0,
-				    dlen - lrwb->lr_length);
-			}
+				memset((char *)dbuf + lrwb->lr_length, 0, dpad);
 
 			/*
 			 * Typically, the only return values we should see from
@@ -2243,26 +2074,39 @@
 				    error);
 				zfs_fallthrough;
 			case EIO:
-				if (lwb->lwb_indirect) {
-					txg_wait_synced(zilog->zl_dmu_pool,
-					    lr->lrc_txg);
-				} else {
-					lwb->lwb_write_zio->io_error = error;
-				}
+				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				zfs_fallthrough;
 			case ENOENT:
 				zfs_fallthrough;
 			case EEXIST:
 				zfs_fallthrough;
 			case EALREADY:
-				return;
+				return (lwb);
 			}
 		}
 	}
 
-	lwb->lwb_nfilled += reclen + dlen;
-	ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
-	ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
+	/*
+	 * We're actually making an entry, so update lrc_seq to be the
+	 * log record sequence number.  Note that this is generally not
+	 * equal to the itx sequence number because not all transactions
+	 * are synchronous, and sometimes spa_sync() gets there first.
+	 */
+	lrcb->lrc_seq = ++zilog->zl_lr_seq;
+	lwb->lwb_nused += reclen + dnow;
+
+	zil_lwb_add_txg(lwb, txg);
+
+	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
+	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+
+	dlen -= dnow;
+	if (dlen > 0) {
+		zilog->zl_cur_used += reclen;
+		goto cont;
+	}
+
+	return (lwb);
 }
 
 itx_t *
@@ -2287,16 +2131,6 @@
 	return (itx);
 }
 
-static itx_t *
-zil_itx_clone(itx_t *oitx)
-{
-	itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
-	memcpy(itx, oitx, oitx->itx_size);
-	itx->itx_callback = NULL;
-	itx->itx_callback_data = NULL;
-	return (itx);
-}
-
 void
 zil_itx_destroy(itx_t *itx)
 {
@@ -2328,7 +2162,7 @@
 		/*
 		 * In the general case, commit itxs will not be found
 		 * here, as they'll be committed to an lwb via
-		 * zil_lwb_assign(), and free'd in that function. Having
+		 * zil_lwb_commit(), and free'd in that function. Having
 		 * said that, it is still possible for commit itxs to be
 		 * found here, due to the following race:
 		 *
@@ -2546,10 +2380,10 @@
 * This function will traverse the queue of itxs that need to be
 * committed, and move them onto the ZIL's zl_itx_commit_list.
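+ * Itxs are collected from the per-txg buckets one bucket at a time,
+ * each under its own itxg_lock, so itx producers are only briefly blocked.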
 */
-static uint64_t
+static void
zil_get_commit_list(zilog_t *zilog)
 {
-	uint64_t otxg, txg, wtxg = 0;
+	uint64_t otxg, txg;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@@ -2583,22 +2417,10 @@
 		 */
 		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
-		list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
-		if (unlikely(zilog->zl_suspend > 0)) {
-			/*
-			 * ZIL was just suspended, but we lost the race.
-			 * Allow all earlier itxs to be committed, but ask
-			 * caller to do txg_wait_synced(txg) for any new.
-			 */
-			if (!list_is_empty(sync_list))
-				wtxg = MAX(wtxg, txg);
-		} else {
-			list_move_tail(commit_list, sync_list);
-		}
+		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
 
 		mutex_exit(&itxg->itxg_lock);
 	}
-	return (wtxg);
 }
 
 /*
@@ -2739,7 +2561,7 @@
 * lwb will be issued to the zio layer to be written to disk.
 */
static void
-zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
+zil_process_commit_list(zilog_t *zilog)
 {
 	spa_t *spa = zilog->zl_spa;
 	list_t nolwb_itxs;
@@ -2841,23 +2663,18 @@
 		 */
 		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
 			if (lwb != NULL) {
-				lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
-				if (lwb == NULL) {
+				lwb = zil_lwb_commit(zilog, itx, lwb);
+
+				if (lwb == NULL)
 					list_insert_tail(&nolwb_itxs, itx);
-				} else if ((zcw->zcw_lwb != NULL &&
-				    zcw->zcw_lwb != lwb) || zcw->zcw_done) {
-					/*
-					 * Our lwb is done, leave the rest of
-					 * itx list to somebody else who care.
-					 */
-					first = B_FALSE;
-					break;
-				}
+				else
+					list_insert_tail(&lwb->lwb_itxs, itx);
 			} else {
 				if (lrc->lrc_txtype == TX_COMMIT) {
 					zil_commit_waiter_link_nolwb(
 					    itx->itx_private, &nolwb_waiters);
 				}
+
 				list_insert_tail(&nolwb_itxs, itx);
 			}
 		} else {
@@ -2873,8 +2690,6 @@
 			 * the ZIL write pipeline; see the comment within
 			 * zil_commit_writer_stall() for more details.
 			 */
-			while ((lwb = list_remove_head(ilwbs)) != NULL)
-				zil_lwb_write_issue(zilog, lwb);
 			zil_commit_writer_stall(zilog);
 
 			/*
@@ -2920,13 +2735,13 @@
 		 * on the system, such that this function will be
 		 * immediately called again (not necessarily by the same
 		 * thread) and this lwb's zio will be issued via
-		 * zil_lwb_assign(). This way, the lwb is guaranteed to
+		 * zil_lwb_commit(). This way, the lwb is guaranteed to
 		 * be "full" when it is issued to disk, and we'll make
 		 * use of the lwb's size the best we can.
 		 *
 		 * 2. If there isn't sufficient ZIL activity occurring on
 		 * the system, such that this lwb's zio isn't issued via
-		 * zil_lwb_assign(), zil_commit_waiter() will issue the
+		 * zil_lwb_commit(), zil_commit_waiter() will issue the
 		 * lwb's zio. If this occurs, the lwb is not guaranteed
 		 * to be "full" by the time its zio is issued, and means
 		 * the size of the lwb was "too large" given the amount
@@ -2958,14 +2773,10 @@
 		    zfs_commit_timeout_pct / 100;
 		if (sleep < zil_min_commit_timeout ||
 		    lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) {
-			lwb = zil_lwb_write_close(zilog, lwb, ilwbs);
+			lwb = zil_lwb_write_issue(zilog, lwb);
 			zilog->zl_cur_used = 0;
-			if (lwb == NULL) {
-				while ((lwb = list_remove_head(ilwbs))
-				    != NULL)
-					zil_lwb_write_issue(zilog, lwb);
+			if (lwb == NULL)
 				zil_commit_writer_stall(zilog);
-			}
 		}
 	}
 }
@@ -2985,17 +2796,12 @@
 * not issued, we rely on future calls to zil_commit_writer() to issue
 * the lwb, or the timeout mechanism found in zil_commit_waiter().
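+ * If the waiter is already attached to an lwb (or already done), the
+ * commit list processing below is skipped entirely.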
 */
-static uint64_t
+static void
zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
-	list_t ilwbs;
-	lwb_t *lwb;
-	uint64_t wtxg = 0;
-
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
-	list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
 	mutex_enter(&zilog->zl_issuer_lock);
 
 	if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
@@ -3020,16 +2826,12 @@
 
 	ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
 
-	wtxg = zil_get_commit_list(zilog);
+	zil_get_commit_list(zilog);
 	zil_prune_commit_list(zilog);
-	zil_process_commit_list(zilog, zcw, &ilwbs);
+	zil_process_commit_list(zilog);
 
out:
 	mutex_exit(&zilog->zl_issuer_lock);
-	while ((lwb = list_remove_head(&ilwbs)) != NULL)
-		zil_lwb_write_issue(zilog, lwb);
-	list_destroy(&ilwbs);
-	return (wtxg);
 }
 
static void
@@ -3056,7 +2858,7 @@
 		return;
 
 	/*
-	 * In order to call zil_lwb_write_close() we must hold the
+	 * In order to call zil_lwb_write_issue() we must hold the
 	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
 	 * since we're already holding the commit waiter's "zcw_lock",
 	 * and those two locks are acquired in the opposite order
@@ -3074,10 +2876,8 @@
 	 * the waiter is marked "done"), so without this check we could
 	 * wind up with a use-after-free error below.
 	 */
-	if (zcw->zcw_done) {
-		lwb = NULL;
+	if (zcw->zcw_done)
 		goto out;
-	}
 
 	ASSERT3P(lwb, ==, zcw->zcw_lwb);
 
@@ -3096,17 +2896,15 @@
 	 * if it's ISSUED or OPENED, and block any other threads that might
 	 * attempt to issue this lwb. For that reason we hold the
 	 * zl_issuer_lock when checking the lwb_state; we must not call
-	 * zil_lwb_write_close() if the lwb had already been issued.
+	 * zil_lwb_write_issue() if the lwb had already been issued.
 	 *
 	 * See the comment above the lwb_state_t structure definition for
 	 * more details on the lwb states, and locking requirements.
 	 */
 	if (lwb->lwb_state == LWB_STATE_ISSUED ||
 	    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
-	    lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
-		lwb = NULL;
+	    lwb->lwb_state == LWB_STATE_FLUSH_DONE)
 		goto out;
-	}
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 
@@ -3116,7 +2914,7 @@
 	 * since we've reached the commit waiter's timeout and it still
 	 * hasn't been issued.
 	 */
-	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, NULL);
+	lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
 
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
 
@@ -3136,7 +2934,7 @@
 
 	if (nlwb == NULL) {
 		/*
-		 * When zil_lwb_write_close() returns NULL, this
+		 * When zil_lwb_write_issue() returns NULL, this
 		 * indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this occurs, the ZIL write
 		 * pipeline must be stalled; see the comment within the
@@ -3158,16 +2956,12 @@
 		 * lock, which occurs prior to calling dmu_tx_commit()
 		 */
 		mutex_exit(&zcw->zcw_lock);
-		zil_lwb_write_issue(zilog, lwb);
-		lwb = NULL;
 		zil_commit_writer_stall(zilog);
 		mutex_enter(&zcw->zcw_lock);
 	}
 
out:
 	mutex_exit(&zilog->zl_issuer_lock);
-	if (lwb)
-		zil_lwb_write_issue(zilog, lwb);
 	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 }
 
@@ -3182,7 +2976,7 @@
 * waited "long enough" and the lwb is still in the "open" state.
 *
 * Given a sufficient amount of itxs being generated and written using
- * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
 * function. If this does not occur, this secondary responsibility will
 * ensure the lwb is issued even if there is not other synchronous
 * activity on the system.
@@ -3545,7 +3339,7 @@
 	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
 	zil_commit_itx_assign(zilog, zcw);
 
-	uint64_t wtxg = zil_commit_writer(zilog, zcw);
+	zil_commit_writer(zilog, zcw);
 	zil_commit_waiter(zilog, zcw);
 
 	if (zcw->zcw_zio_error != 0) {
@@ -3560,8 +3354,6 @@
 		DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog,
 		    zil_commit_waiter_t *, zcw);
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
-	} else if (wtxg != 0) {
-		txg_wait_synced(zilog->zl_dmu_pool, wtxg);
 	}
 
 	zil_free_commit_waiter(zcw);
@@ -3864,7 +3656,7 @@
 	/*
 	 * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
 	 * on the time when the dmu_tx transaction is assigned in
-	 * zil_lwb_write_close().
+	 * zil_lwb_write_issue().
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -626,6 +626,8 @@
 void
 zio_add_child(zio_t *pio, zio_t *cio)
 {
+	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
@@ -634,7 +636,6 @@
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
-	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
@@ -643,45 +644,16 @@
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
-	uint64_t *countp = pio->io_children[cio->io_child_type];
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
-		countp[w] += !cio->io_state[w];
+		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 
 	list_insert_head(&pio->io_child_list, zl);
 	list_insert_head(&cio->io_parent_list, zl);
 
-	mutex_exit(&cio->io_lock);
-	mutex_exit(&pio->io_lock);
-}
-
-void
-zio_add_child_first(zio_t *pio, zio_t *cio)
-{
-	/*
-	 * Logical I/Os can have logical, gang, or vdev children.
-	 * Gang I/Os can have gang or vdev children.
-	 * Vdev I/Os can only have vdev children.
-	 * The following ASSERT captures all of these constraints.
-	 */
-	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
-
-	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
-	zl->zl_parent = pio;
-	zl->zl_child = cio;
-
-	ASSERT(list_is_empty(&cio->io_parent_list));
-	list_insert_head(&cio->io_parent_list, zl);
-
-	mutex_enter(&pio->io_lock);
-
-	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
-
-	uint64_t *countp = pio->io_children[cio->io_child_type];
-	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
-		countp[w] += !cio->io_state[w];
-
-	list_insert_head(&pio->io_child_list, zl);
+	pio->io_child_count++;
+	cio->io_parent_count++;
 
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 }
@@ -697,6 +669,9 @@
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
 
+	pio->io_child_count--;
+	cio->io_parent_count--;
+
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 	kmem_cache_free(zio_link_cache, zl);
@@ -871,14 +846,12 @@
 	zio->io_child_type = ZIO_CHILD_LOGICAL;
 
 	if (bp != NULL) {
+		zio->io_bp = (blkptr_t *)bp;
+		zio->io_bp_copy = *bp;
+		zio->io_bp_orig = *bp;
 		if (type != ZIO_TYPE_WRITE ||
-		    zio->io_child_type == ZIO_CHILD_DDT) {
-			zio->io_bp_copy = *bp;
+		    zio->io_child_type == ZIO_CHILD_DDT)
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
-		} else {
-			zio->io_bp = (blkptr_t *)bp;
-		}
-		zio->io_bp_orig = *bp;
 		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
 		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
@@ -913,7 +886,7 @@
 		zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
 			zio->io_gang_leader = pio->io_gang_leader;
-		zio_add_child_first(pio, zio);
+		zio_add_child(pio, zio);
 	}
 
 	taskq_init_ent(&zio->io_tqent);
@@ -1189,8 +1162,9 @@
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *children_ready,
-    zio_done_func_t *done, void *private, zio_priority_t priority,
-    zio_flag_t flags, const zbookmark_phys_t *zb)
+    zio_done_func_t *physdone, zio_done_func_t *done,
+    void *private, zio_priority_t priority, zio_flag_t flags,
+    const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
@@ -1210,6 +1184,7 @@
 
 	zio->io_ready = ready;
 	zio->io_children_ready = children_ready;
+	zio->io_physdone = physdone;
 	zio->io_prop = *zp;
 
 	/*
@@ -1542,11 +1517,16 @@
 		flags &= ~ZIO_FLAG_IO_ALLOCATING;
 	}
 
+
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 
+	zio->io_physdone = pio->io_physdone;
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+		zio->io_logical->io_phys_children++;
+
 	return (zio);
 }
 
@@ -1634,8 +1614,15 @@
 		abd_return_buf_copy(zio->io_abd, data, psize);
 	} else {
 		ASSERT(!BP_IS_EMBEDDED(bp));
+		ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 	}
 
+	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
 	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 
@@ -2730,7 +2717,7 @@
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(gio == zio_unique_parent(zio));
-	ASSERT(list_is_empty(&zio->io_child_list));
+	ASSERT(zio->io_child_count == 0);
 
 	if (zio->io_error)
 		return;
@@ -2988,7 +2975,7 @@
 		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ?
 		    abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp,
-		    zio_write_gang_member_ready, NULL,
+		    zio_write_gang_member_ready, NULL, NULL,
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
@@ -3450,7 +3437,7 @@
 	} else {
 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 		    zio->io_orig_size, zio->io_orig_size, zp,
-		    zio_ddt_child_write_ready, NULL,
+		    zio_ddt_child_write_ready, NULL, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
@@ -3968,6 +3955,9 @@
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM)) {
 
+		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
+			return (zio);
+
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (NULL);
 
@@ -4004,6 +3994,9 @@
 	    vd->vdev_ops != &vdev_draid_spare_ops) {
 		vdev_queue_io_done(zio);
 
+		if (zio->io_type == ZIO_TYPE_WRITE)
+			vdev_cache_write(zio);
+
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_device_injections(vd, zio,
 			    EIO, EILSEQ);
@@ -4113,7 +4106,8 @@
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
 		zio->io_error = 0;
-		zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE;
+		zio->io_flags |= ZIO_FLAG_IO_RETRY |
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
 		    zio_requeue_io_start_cut_in_line);
@@ -4153,6 +4147,13 @@
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    zio->io_physdone != NULL) {
+		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+		zio->io_physdone(zio->io_logical);
+	}
+
 	return (zio);
 }
 
@@ -4474,10 +4475,8 @@
 		zio->io_ready(zio);
 	}
 
-#ifdef ZFS_DEBUG
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
-#endif
 
 	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -4904,7 +4903,7 @@
 		return (NULL);
 	}
 
-	ASSERT(list_is_empty(&zio->io_child_list));
+	ASSERT(zio->io_child_count == 0);
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -1203,7 +1203,8 @@
 	 * Prefetch is completed, we can do zvol_os_create_minor
 	 * sequentially.
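+	 * Each job is removed from minors_list and freed here, so the
+	 * list is empty again by the time this loop finishes.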
*/ - while ((job = list_remove_head(&minors_list)) != NULL) { + while ((job = list_head(&minors_list)) != NULL) { + list_remove(&minors_list, job); if (!job->error) (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); @@ -1310,8 +1311,10 @@ rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ - while ((zv = list_remove_head(&free_list)) != NULL) + while ((zv = list_head(&free_list)) != NULL) { + list_remove(&free_list, zv); zvol_os_free(zv); + } } /* Remove minor for this specific volume only */ diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -128,7 +128,7 @@ 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', - 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup'] + 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2'] pre = post = tags = ['functional', 'cli_root', 'zdb'] @@ -472,8 +472,7 @@ tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] -tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart', - 'zpool_resilver_concurrent'] +tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] diff --git a/sys/contrib/openzfs/tests/runfiles/freebsd.run b/sys/contrib/openzfs/tests/runfiles/freebsd.run --- a/sys/contrib/openzfs/tests/runfiles/freebsd.run +++ b/sys/contrib/openzfs/tests/runfiles/freebsd.run @@ -25,8 +25,3 @@ [tests/functional/cli_root/zfs_jail:FreeBSD] tests = ['zfs_jail_001_pos'] tags = ['functional', 'cli_root', 'zfs_jail'] - -[tests/functional/pam:FreeBSD] -tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', - 'pam_short_password'] -tags = ['functional', 'pam'] diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run --- a/sys/contrib/openzfs/tests/runfiles/linux.run +++ b/sys/contrib/openzfs/tests/runfiles/linux.run @@ -140,8 +140,7 @@ tags = ['functional', 'mount'] [tests/functional/pam:Linux] -tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', - 'pam_short_password'] +tests = ['pam_basic', 'pam_nounmount', 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -152,7 +152,6 @@ ['FAIL', rewind_reason], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], - 'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053], 'privilege/setup': ['SKIP', na_reason], 'refreserv/refreserv_004_pos': ['FAIL', known_reason], 'rootpool/setup': ['SKIP', na_reason], @@ -164,8 +163,6 @@ known.update({ 'cli_root/zfs_receive/receive-o-x_props_override': ['FAIL', known_reason], - 'cli_root/zpool_resilver/zpool_resilver_concurrent': - ['SKIP', na_reason], 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], @@ 
@@ -173,7 +170,6 @@
         'link_count/link_count_001': ['SKIP', na_reason],
         'casenorm/mixed_create_failure': ['FAIL', 13215],
         'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
-        'rsend/send_raw_ashift': ['SKIP', 14961],
     })
 elif sys.platform.startswith('linux'):
     known.update({
@@ -281,8 +277,6 @@
         'mmp/mmp_inactive_import': ['FAIL', known_reason],
         'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621],
         'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason],
-        'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872],
-        'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872],
         'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason],
         'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason],
         'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason],
diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c
--- a/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c
+++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c
@@ -501,7 +501,7 @@
 	srandom(seed);
 
 	zfs_btree_init();
-	zfs_btree_create(&bt, zfs_btree_compare, NULL, sizeof (uint64_t));
+	zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t));
 
 	/*
 	 * This runs the named negative test. None of them should
diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib
--- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib
+++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib
@@ -3706,7 +3706,7 @@
 	while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do
 		typeset stat1=$(get_arcstat $stat)
-		sleep 0.5
+		sleep 2
 		typeset stat2=$(get_arcstat $stat)
 		do_once=false
 	done
 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
@@ -572,7 +572,6 @@
 	functional/cli_root/zdb/zdb_006_pos.ksh \
 	functional/cli_root/zdb/zdb_args_neg.ksh \
 	functional/cli_root/zdb/zdb_args_pos.ksh \
-	functional/cli_root/zdb/zdb_backup.ksh \
 	functional/cli_root/zdb/zdb_block_size_histogram.ksh \
 	functional/cli_root/zdb/zdb_checksum.ksh \
 	functional/cli_root/zdb/zdb_decompress.ksh \
@@ -1143,7 +1142,6 @@
 	functional/cli_root/zpool_resilver/setup.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \
-	functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \
 	functional/cli_root/zpool_scrub/cleanup.ksh \
 	functional/cli_root/zpool_scrub/setup.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh
deleted file mode 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/ksh
-
-#
-# This file and its contents are supplied under the terms of the
-# Common Development and Distribution License ("CDDL"), version 1.0.
-# You may only use this file in accordance with the terms of version
-# 1.0 of the CDDL.
-#
-# A full copy of the text of the CDDL should have accompanied this
-# source. A copy of the CDDL is also available via the Internet at
-# http://www.illumos.org/license/CDDL.
-#
-
-#
-# Copyright (c) 2023, Klara Inc.
-#
-
-. $STF_SUITE/include/libtest.shlib
-
-write_count=8
-blksize=131072
-
-tmpfile=$TEST_BASE_DIR/tmpfile
-
-function cleanup
-{
-	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
-	rm $tmpfile.1 $tmpfile.2
-}
-
-log_onexit cleanup
-
-log_assert "Verify that zfs send and zdb -B produce the same stream"
-
-verify_runnable "global"
-verify_disk_count "$DISKS" 2
-
-default_mirror_setup_noexit $DISKS
-file_write -o create -w -f $TESTDIR/file -b $blksize -c $write_count
-
-snap=$TESTPOOL/$TESTFS@snap
-log_must zfs snapshot $snap
-typeset -i objsetid=$(zfs get -Ho value objsetid $snap)
-
-sync_pool $TESTPOOL
-
-log_must eval "zfs send -ecL $snap > $tmpfile.1"
-log_must eval "zdb -B $TESTPOOL/$objsetid ecL > $tmpfile.2"
-
-typeset sum1=$(cat $tmpfile.1 | md5sum)
-typeset sum2=$(cat $tmpfile.2 | md5sum)
-
-log_must test "$sum1" = "$sum2"
-
-log_pass "zfs send and zdb -B produce the same stream"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh
deleted file mode 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
-
-#
-# DESCRIPTION:
-#	Verify 'zpool clear' doesn't cause concurrent resilvers
-#
-# STRATEGY:
-#	1. Create N(10) virtual disk files.
-#	2. Create draid pool based on the virtual disk files.
-#	3. Fill the filesystem with directories and files.
-#	4. Force-fault 2 vdevs and verify distributed spare is kicked in.
-#	5. Free the distributed spare by replacing the faulty drive.
-#	6. Run zpool clear and verify that it does not initiate 2 resilvers
-#	   concurrently while distributed spare gets kicked in.
-#
-
-verify_runnable "global"
-
-typeset -ir devs=10
-typeset -ir nparity=1
-typeset -ir ndata=8
-typeset -ir dspare=1
-
-function cleanup
-{
-	poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
-
-	for i in {0..$devs}; do
-		log_must rm -f "$BASEDIR/vdev$i"
-	done
-
-	for dir in $BASEDIR; do
-		if [[ -d $dir ]]; then
-			log_must rm -rf $dir
-		fi
-	done
-
-	zed_stop
-	zed_cleanup
-}
-
-log_assert "Verify zpool clear on draid pool doesn't cause concurrent resilvers"
-log_onexit cleanup
-
-setup_test_env $TESTPOOL draid${nparity}:${ndata}d:${dspare}s $devs
-
-# ZED needed for sequential resilver
-zed_setup
-log_must zed_start
-
-log_must zpool offline -f $TESTPOOL $BASEDIR/vdev5
-log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60
-log_must zpool wait -t resilver $TESTPOOL
-log_must zpool offline -f $TESTPOOL $BASEDIR/vdev6
-
-log_must zpool labelclear -f $BASEDIR/vdev5
-log_must zpool labelclear -f $BASEDIR/vdev6
-
-log_must zpool replace -w $TESTPOOL $BASEDIR/vdev5
-sync_pool $TESTPOOL
-
-log_must zpool events -c
-log_must zpool clear $TESTPOOL
-log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60
-log_must zpool wait -t resilver $TESTPOOL
-log_must zpool wait -t scrub $TESTPOOL
-
-nof_resilver=$(zpool events | grep -c resilver_start)
-if [ $nof_resilver = 1 ] ; then
-	log_must verify_pool $TESTPOOL
-	log_pass "zpool clear on draid pool doesn't cause concurrent resilvers"
-else
-	log_fail "FAIL: sequential and healing resilver initiated concurrently"
-fi
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
@@ -25,7 +25,7 @@
 is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null &&
     log_unsupported "python3 sysctl module missing"
 
 set -A args "" "-s \",\"" "-v" \
-    "-f time,cwc,imnb,imsb"
+    "-f time,zcwc,zimnb,zimsb"
 
 log_assert "zilstat generates output and doesn't return an error code"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh
@@ -27,14 +27,15 @@
 #
 # STRATEGY:
 #	1. Create pool with a cache device.
-#	2. Create a random file in that pool and random read for 10 sec.
-#	3. Export pool.
-#	4. Read the amount of log blocks written from the header of the
+#	2. Export and re-import pool without writing any data.
+#	3. Create a random file in that pool and random read for 10 sec.
+#	4. Export pool.
+#	5. Read the amount of log blocks written from the header of the
 #	   L2ARC device.
-#	5. Import pool.
-#	6. Read the amount of log blocks rebuilt in arcstats and compare to
+#	6. Import pool.
+#	7. Read the amount of log blocks rebuilt in arcstats and compare to
 #	   (5).
-#	7. Check if the labels of the L2ARC device are intact.
+#	8. Check if the labels of the L2ARC device are intact.
 #
 # * We can predict the minimum bytes of L2ARC restored if we subtract
 #   from the effective size of the cache device the bytes l2arc_evict()
@@ -76,8 +77,10 @@
 
 log_must truncate -s ${cache_sz}M $VDEV_CACHE
 
-log_must zpool create -f -o ashift=12 $TESTPOOL $VDEV
-log_must zpool add $TESTPOOL cache $VDEV_CACHE
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+log_must zpool export $TESTPOOL
+log_must zpool import -d $VDIR $TESTPOOL
 
 log_must fio $FIO_SCRIPTS/mkfiles.fio
 log_must fio $FIO_SCRIPTS/random_reads.fio
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh
@@ -25,6 +25,5 @@
 rmconfig
 destroy_pool $TESTPOOL
 del_user ${username}
-del_user ${username}rec
 del_group pamtestgroup
 log_must rm -rf "$runstatedir" $TESTDIRS
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh
deleted file mode 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or https://opensource.org/licenses/CDDL-1.0.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-. $STF_SUITE/tests/functional/pam/utilities.kshlib
-
-if [ -n "$ASAN_OPTIONS" ]; then
-	export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}')
-fi
-
-log_mustnot ismounted "$TESTPOOL/pam/${username}"
-keystatus unavailable
-
-genconfig "homes=$TESTPOOL/pam runstatedir=${runstatedir}"
-
-printf "testpass\nsecondpass\nsecondpass\n" | pamtester -v ${pamservice} ${username} chauthtok
-
-log_mustnot ismounted "$TESTPOOL/pam/${username}"
-keystatus unavailable
-
-echo "secondpass" | pamtester ${pamservice} ${username} open_session
-references 1
-log_must ismounted "$TESTPOOL/pam/${username}"
-keystatus available
-
-printf "secondpass\ntestpass\ntestpass\n" | pamtester -v ${pamservice} ${username} chauthtok
-
-log_must ismounted "$TESTPOOL/pam/${username}"
-log_must ismounted "$TESTPOOL/pam/${username}"
-keystatus available
-
-log_must pamtester ${pamservice} ${username} close_session
-references 0
-log_mustnot ismounted "$TESTPOOL/pam/${username}"
-keystatus unavailable
-
-log_pass "done."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh
deleted file mode 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or https://opensource.org/licenses/CDDL-1.0.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-. $STF_SUITE/tests/functional/pam/utilities.kshlib
-
-if [ -n "$ASAN_OPTIONS" ]; then
-	export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}')
-fi
-
-username="${username}rec"
-
-# Set up a deeper hierarchy, a mountpoint that doesn't interfere with other tests,
-# and a user which references that mountpoint
-log_must zfs create "$TESTPOOL/pampam"
-log_must zfs create -o mountpoint="$TESTDIR/rec" "$TESTPOOL/pampam/pam"
-echo "recurpass" | zfs create -o encryption=aes-256-gcm -o keyformat=passphrase \
-	-o keylocation=prompt "$TESTPOOL/pampam/pam/${username}"
-log_must zfs unmount "$TESTPOOL/pampam/pam/${username}"
-log_must zfs unload-key "$TESTPOOL/pampam/pam/${username}"
-log_must add_user pamtestgroup ${username} "$TESTDIR/rec"
-
-function keystatus {
-	log_must [ "$(get_prop keystatus "$TESTPOOL/pampam/pam/${username}")" = "$1" ]
-}
-
-log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}"
-keystatus unavailable
-
-function test_session {
-	echo "recurpass" | pamtester ${pamservice} ${username} open_session
-	references 1
-	log_must ismounted "$TESTPOOL/pampam/pam/${username}"
-	keystatus available
-
-	log_must pamtester ${pamservice} ${username} close_session
-	references 0
-	log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}"
-	keystatus unavailable
-}
-
-genconfig "homes=$TESTPOOL/pampam/pam prop_mountpoint runstatedir=${runstatedir}"
-test_session
-
-genconfig "homes=$TESTPOOL/pampam recursive_homes prop_mountpoint runstatedir=${runstatedir}"
-test_session
-
-genconfig "homes=$TESTPOOL recursive_homes prop_mountpoint runstatedir=${runstatedir}"
-test_session
-
-genconfig "homes=* recursive_homes prop_mountpoint runstatedir=${runstatedir}"
-test_session
-
-log_pass "done."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
@@ -52,7 +52,7 @@
 keystatus available
 
 # Change user and dataset password to short one.
-printf "testpass\nshort\nshort\n" | pamtester -v ${pamservice} ${username} chauthtok
+printf "short\nshort\n" | pamtester ${pamservice} ${username} chauthtok
 
 # Unmount and unload key.
 log_must pamtester ${pamservice} ${username} close_session
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
@@ -38,8 +38,6 @@
 
 verify_runnable "global"
 
-log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053"
-
 function test_cleanup
 {
 	# reset memory limit to 16M
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh
@@ -37,10 +37,6 @@
 
 log_assert "Verify raw sending to pools with greater ashift succeeds"
 
-if is_freebsd; then
-	log_unsupported "Runs too long on FreeBSD 14 (Issue #14961)"
-fi
-
 function cleanup
 {
 	rm -f $BACKDIR/fs@*
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
@@ -45,15 +45,6 @@
 
 if ! is_linux ; then
 	log_unsupported "Only linux supports dd with oflag=dsync for FUA writes"
-else
-	if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then
-		log_unsupported "Disabled while issue #14872 is being worked"
-	fi
-
-	# Disabled for the CentOS 9 kernel
-	if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then
-		log_unsupported "Disabled while issue #14872 is being worked"
-	fi
 fi
 
 typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh
@@ -44,15 +44,6 @@
 verify_runnable "global"
 
 if is_linux ; then
-	if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then
-		log_unsupported "Disabled while issue #14872 is being worked"
-	fi
-
-	# Disabled for the CentOS 9 kernel
-	if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then
-		log_unsupported "Disabled while issue #14872 is being worked"
-	fi
-
 	# We need '--force' here since the prior tests may leave a filesystem
 	# on the zvol, and blkdiscard will see that filesystem and print a
 	# warning unless you force it.
@@ -132,6 +123,7 @@
 
 	# Remove old data from previous tests
 	log_must $trimcmd $zvolpath
+	set_blk_mq 1
 
 	log_must_busy zpool export $TESTPOOL
 	log_must zpool import $TESTPOOL
diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile
--- a/sys/modules/zfs/Makefile
+++ b/sys/modules/zfs/Makefile
@@ -38,7 +38,7 @@
 
 CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS \
     -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \
-    -D_SYS_VMEM_H_
+    -D_SYS_VMEM_H_ -DIN_FREEBSD_BASE
 
 .if ${MACHINE_ARCH} == "amd64"
 CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \
@@ -295,6 +295,7 @@
 	uberblock.c \
 	unique.c \
 	vdev.c \
+	vdev_cache.c \
 	vdev_draid.c \
 	vdev_draid_rand.c \
 	vdev_indirect.c \
diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h
--- a/sys/modules/zfs/zfs_config.h
+++ b/sys/modules/zfs/zfs_config.h
@@ -653,9 +653,6 @@
 /* qat is enabled and existed */
 /* #undef HAVE_QAT */
 
-/* struct reclaim_state has reclaimed */
-/* #undef HAVE_RECLAIM_STATE_RECLAIMED */
-
 /* register_shrinker is vararg */
 /* #undef HAVE_REGISTER_SHRINKER_VARARG */
 
@@ -1051,7 +1048,7 @@
 /* #undef ZFS_IS_GPL_COMPATIBLE */
 
 /* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.2.0-FreeBSD_g009d3288"
+#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gad0a55461"
 
 /* Define the project author. */
 #define ZFS_META_AUTHOR "OpenZFS"
@@ -1060,7 +1057,7 @@
 /* #undef ZFS_META_DATA */
 
 /* Define the maximum compatible kernel version. */
-#define ZFS_META_KVER_MAX "6.3"
+#define ZFS_META_KVER_MAX "6.2"
 
 /* Define the minimum compatible kernel version. */
 #define ZFS_META_KVER_MIN "3.10"
@@ -1081,10 +1078,10 @@
 #define ZFS_META_NAME "zfs"
 
 /* Define the project release. */
-#define ZFS_META_RELEASE "FreeBSD_g009d3288"
+#define ZFS_META_RELEASE "FreeBSD_gad0a55461"
 
 /* Define the project version. */
-#define ZFS_META_VERSION "2.2.0"
+#define ZFS_META_VERSION "2.1.99"
 
 /* count is located in percpu_ref.data */
 /* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.2.0-rc1-0-g009d3288d"
+#define ZFS_META_GITREV "zfs-2.1.99-1955-gad0a55461"