Index: sys/dev/nvd/nvd.c =================================================================== --- sys/dev/nvd/nvd.c +++ sys/dev/nvd/nvd.c @@ -279,7 +279,7 @@ disk->d_sectorsize = nvme_ns_get_sector_size(ns); disk->d_mediasize = (off_t)nvme_ns_get_size(ns); disk->d_delmaxsize = (off_t)nvme_ns_get_size(ns); - disk->d_stripesize = nvme_ns_get_stripesize(ns); + disk->d_stripesize = nvme_ns_get_optimal_sector_size(ns); if (TAILQ_EMPTY(&disk_head)) disk->d_unit = 0; Index: sys/dev/nvme/nvme.h =================================================================== --- sys/dev/nvme/nvme.h +++ sys/dev/nvme/nvme.h @@ -870,6 +870,7 @@ const char * nvme_ns_get_model_number(struct nvme_namespace *ns); const struct nvme_namespace_data * nvme_ns_get_data(struct nvme_namespace *ns); +uint32_t nvme_ns_get_optimal_sector_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns); int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp, Index: sys/dev/nvme/nvme_ns.c =================================================================== --- sys/dev/nvme/nvme_ns.c +++ sys/dev/nvme/nvme_ns.c @@ -45,6 +45,8 @@ #include "nvme_private.h" +extern int nvme_max_optimal_sectorsize; + static void nvme_bio_child_inbed(struct bio *parent, int bio_error); static void nvme_bio_child_done(void *arg, const struct nvme_completion *cpl); @@ -217,6 +219,22 @@ return (ns->stripesize); } +uint32_t +nvme_ns_get_optimal_sector_size(struct nvme_namespace *ns) +{ + uint32_t stripesize; + + stripesize = nvme_ns_get_stripesize(ns); + + if (stripesize == 0) + return nvme_ns_get_sector_size(ns); + + if (nvme_max_optimal_sectorsize == 0) + return (stripesize); + + return (MIN(stripesize, nvme_max_optimal_sectorsize)); +} + static void nvme_ns_bio_done(void *arg, const struct nvme_completion *status) { Index: sys/dev/nvme/nvme_sysctl.c =================================================================== --- sys/dev/nvme/nvme_sysctl.c +++ sys/dev/nvme/nvme_sysctl.c @@ -33,6 +33,22 @@ #include "nvme_private.h" +SYSCTL_NODE(_kern, OID_AUTO, nvme, CTLFLAG_RD, 0, "NVM Express"); +/* + * Intel NVMe controllers have a slow path for I/Os that span a 128KB + * stripe boundary but ZFS limits ashift, which is derived from + * d_stripesize, to 13 (8KB) so we limit the stripesize reported to + * geom(8) to 4KB by default. + * + * This may result in a small number of additional I/Os to require + * splitting in nvme(4), however the NVMe I/O path is very efficient + * so these additional I/Os will cause very minimal (if any) difference + * in performance or CPU utilisation. + */ +int nvme_max_optimal_sectorsize = 1<<12; +SYSCTL_INT(_kern_nvme, OID_AUTO, max_optimal_sectorsize, CTLFLAG_RWTUN, + &nvme_max_optimal_sectorsize, 0, "The maximum optimal sectorsize reported"); + /* * CTLTYPE_S64 and sysctl_handle_64 were added in r217616. Define these * explicitly here for older kernels that don't include the r217616