Index: head/sys/cam/ata/ata_da.c =================================================================== --- head/sys/cam/ata/ata_da.c (revision 329818) +++ head/sys/cam/ata/ata_da.c (revision 329819) @@ -1,3584 +1,3584 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ada.h" #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #ifndef _KERNEL #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #include #include /* geometry translation */ #ifdef _KERNEL #define ATA_MAX_28BIT_LBA 268435455UL extern int iosched_debug; typedef enum { ADA_STATE_RAHEAD, ADA_STATE_WCACHE, ADA_STATE_LOGDIR, ADA_STATE_IDDIR, ADA_STATE_SUP_CAP, ADA_STATE_ZONE, ADA_STATE_NORMAL } ada_state; typedef enum { ADA_FLAG_CAN_48BIT = 0x00000002, ADA_FLAG_CAN_FLUSHCACHE = 0x00000004, ADA_FLAG_CAN_NCQ = 0x00000008, ADA_FLAG_CAN_DMA = 0x00000010, ADA_FLAG_NEED_OTAG = 0x00000020, ADA_FLAG_WAS_OTAG = 0x00000040, ADA_FLAG_CAN_TRIM = 0x00000080, ADA_FLAG_OPEN = 0x00000100, ADA_FLAG_SCTX_INIT = 0x00000200, ADA_FLAG_CAN_CFA = 0x00000400, ADA_FLAG_CAN_POWERMGT = 0x00000800, ADA_FLAG_CAN_DMA48 = 0x00001000, ADA_FLAG_CAN_LOG = 0x00002000, ADA_FLAG_CAN_IDLOG = 0x00004000, ADA_FLAG_CAN_SUPCAP = 0x00008000, ADA_FLAG_CAN_ZONE = 0x00010000, ADA_FLAG_CAN_WCACHE = 0x00020000, ADA_FLAG_CAN_RAHEAD = 0x00040000, ADA_FLAG_PROBED = 0x00080000, ADA_FLAG_ANNOUNCED = 0x00100000, ADA_FLAG_DIRTY = 0x00200000, ADA_FLAG_CAN_NCQ_TRIM = 0x00400000, /* CAN_TRIM also set */ ADA_FLAG_PIM_ATA_EXT = 0x00800000 } ada_flags; typedef enum { ADA_Q_NONE = 0x00, ADA_Q_4K = 0x01, ADA_Q_NCQ_TRIM_BROKEN = 0x02, ADA_Q_LOG_BROKEN = 0x04, ADA_Q_SMR_DM = 0x08 } ada_quirks; #define ADA_Q_BIT_STRING \ "\020" \ "\0014K" \ "\002NCQ_TRIM_BROKEN" \ "\003LOG_BROKEN" \ "\004SMR_DM" typedef enum { ADA_CCB_RAHEAD = 0x01, ADA_CCB_WCACHE = 0x02, ADA_CCB_BUFFER_IO = 0x03, ADA_CCB_DUMP = 0x05, ADA_CCB_TRIM = 0x06, ADA_CCB_LOGDIR = 0x07, ADA_CCB_IDDIR = 0x08, ADA_CCB_SUP_CAP = 0x09, ADA_CCB_ZONE = 0x0a, ADA_CCB_TYPE_MASK = 0x0F, } ada_ccb_state; typedef enum { ADA_ZONE_NONE = 0x00, ADA_ZONE_DRIVE_MANAGED = 0x01, ADA_ZONE_HOST_AWARE = 0x02, ADA_ZONE_HOST_MANAGED = 0x03 } ada_zone_mode; typedef enum { ADA_ZONE_FLAG_RZ_SUP = 0x0001, ADA_ZONE_FLAG_OPEN_SUP = 0x0002, ADA_ZONE_FLAG_CLOSE_SUP = 0x0004, ADA_ZONE_FLAG_FINISH_SUP = 0x0008, ADA_ZONE_FLAG_RWP_SUP = 0x0010, ADA_ZONE_FLAG_SUP_MASK = (ADA_ZONE_FLAG_RZ_SUP | ADA_ZONE_FLAG_OPEN_SUP | ADA_ZONE_FLAG_CLOSE_SUP | ADA_ZONE_FLAG_FINISH_SUP | ADA_ZONE_FLAG_RWP_SUP), ADA_ZONE_FLAG_URSWRZ = 0x0020, ADA_ZONE_FLAG_OPT_SEQ_SET = 0x0040, ADA_ZONE_FLAG_OPT_NONSEQ_SET = 0x0080, ADA_ZONE_FLAG_MAX_SEQ_SET = 0x0100, ADA_ZONE_FLAG_SET_MASK = (ADA_ZONE_FLAG_OPT_SEQ_SET | ADA_ZONE_FLAG_OPT_NONSEQ_SET | ADA_ZONE_FLAG_MAX_SEQ_SET) } ada_zone_flags; static struct ada_zone_desc { ada_zone_flags value; const char *desc; } ada_zone_desc_table[] = { {ADA_ZONE_FLAG_RZ_SUP, "Report Zones" }, {ADA_ZONE_FLAG_OPEN_SUP, "Open" }, {ADA_ZONE_FLAG_CLOSE_SUP, "Close" }, {ADA_ZONE_FLAG_FINISH_SUP, "Finish" }, {ADA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" }, }; /* Offsets into our private area for storing information */ #define ccb_state ppriv_field0 #define ccb_bp ppriv_ptr1 typedef enum { ADA_DELETE_NONE, ADA_DELETE_DISABLE, ADA_DELETE_CFA_ERASE, ADA_DELETE_DSM_TRIM, ADA_DELETE_NCQ_DSM_TRIM, ADA_DELETE_MIN = ADA_DELETE_CFA_ERASE, ADA_DELETE_MAX = ADA_DELETE_NCQ_DSM_TRIM, } ada_delete_methods; static const char *ada_delete_method_names[] = { "NONE", "DISABLE", "CFA_ERASE", "DSM_TRIM", "NCQ_DSM_TRIM" }; #if 0 static const char *ada_delete_method_desc[] = { "NONE", "DISABLED", "CFA Erase", "DSM Trim", "DSM Trim via NCQ" }; #endif struct disk_params { u_int8_t heads; u_int8_t secs_per_track; u_int32_t cylinders; u_int32_t secsize; /* Number of bytes/logical sector */ u_int64_t sectors; /* Total number sectors */ }; #define TRIM_MAX_BLOCKS 8 #define TRIM_MAX_RANGES (TRIM_MAX_BLOCKS * ATA_DSM_BLK_RANGES) struct trim_request { uint8_t data[TRIM_MAX_RANGES * ATA_DSM_RANGE_SIZE]; TAILQ_HEAD(, bio) bps; }; struct ada_softc { struct cam_iosched_softc *cam_iosched; int outstanding_cmds; /* Number of active commands */ int refcount; /* Active xpt_action() calls */ ada_state state; ada_flags flags; ada_zone_mode zone_mode; ada_zone_flags zone_flags; struct ata_gp_log_dir ata_logdir; int valid_logdir_len; struct ata_identify_log_pages ata_iddir; int valid_iddir_len; uint64_t optimal_seq_zones; uint64_t optimal_nonseq_zones; uint64_t max_seq_zones; ada_quirks quirks; ada_delete_methods delete_method; int trim_max_ranges; int read_ahead; int write_cache; int unmappedio; int rotating; #ifdef ADA_TEST_FAILURE int force_read_error; int force_write_error; int periodic_read_error; int periodic_read_count; #endif struct disk_params params; struct disk *disk; struct task sysctl_task; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; struct callout sendordered_c; struct trim_request trim_req; #ifdef CAM_IO_STATS struct sysctl_ctx_list sysctl_stats_ctx; struct sysctl_oid *sysctl_stats_tree; u_int timeouts; u_int errors; u_int invalidations; #endif #define ADA_ANNOUNCETMP_SZ 80 char announce_temp[ADA_ANNOUNCETMP_SZ]; #define ADA_ANNOUNCE_SZ 400 char announce_buffer[ADA_ANNOUNCE_SZ]; }; struct ada_quirk_entry { struct scsi_inquiry_pattern inq_pat; ada_quirks quirks; }; static struct ada_quirk_entry ada_quirk_table[] = { { /* Hitachi Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Hitachi H??????????E3*", "*" }, /*quirks*/ADA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD155UI*", "*" }, /*quirks*/ADA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD204UI*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST????DL*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Barracuda Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST???DM*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Barracuda Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST????DM*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9500423AS*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9500424AS*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9640423AS*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9640424AS*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750420AS*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750422AS*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST9750423AS*", "*" }, /*quirks*/ADA_Q_4K }, { /* Seagate Momentus Thin Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST???LT*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Caviar Red Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????CX*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????RS*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Caviar Green/Red Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????RX*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Caviar Red Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????CX*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Caviar Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????AZEX*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Caviar Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD????FZEX*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????RS*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD??????RX*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD???PKT*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD?????PKT*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD???PVT*", "*" }, /*quirks*/ADA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "WDC WD?????PVT*", "*" }, /*quirks*/ADA_Q_4K }, /* SSDs */ { /* * Corsair Force 2 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair CSSD-F*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Corsair Force 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Force 3*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Corsair Neutron GTX SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Corsair Force GT & GS SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Force G*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Crucial M4 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "M4-CT???M4SSD2*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Crucial M500 SSDs MU07 firmware * NCQ Trim works */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "MU07" }, /*quirks*/0 }, { /* * Crucial M500 SSDs all other firmware * NCQ Trim doesn't work */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "*" }, /*quirks*/ADA_Q_NCQ_TRIM_BROKEN }, { /* * Crucial M550 SSDs * NCQ Trim doesn't work, but only on MU01 firmware */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M550*", "MU01" }, /*quirks*/ADA_Q_NCQ_TRIM_BROKEN }, { /* * Crucial MX100 SSDs * NCQ Trim doesn't work, but only on MU01 firmware */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*MX100*", "MU01" }, /*quirks*/ADA_Q_NCQ_TRIM_BROKEN }, { /* * Crucial RealSSD C300 SSDs * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "C300-CTFDDAC???MAG*", "*" }, /*quirks*/ADA_Q_4K }, { /* * FCCT M500 SSDs * NCQ Trim doesn't work */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "FCCT*M500*", "*" }, /*quirks*/ADA_Q_NCQ_TRIM_BROKEN }, { /* * Intel 320 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSA2CW*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Intel 330 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2CT*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Intel 510 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2MH*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Intel 520 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2BW*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Intel S3610 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSC2BX*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Intel X25-M Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "INTEL SSDSA2M*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Kingston E100 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "KINGSTON SE100S3*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Kingston HyperX 3k SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "KINGSTON SH103S3*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Marvell SSDs (entry taken from OpenSolaris) * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "MARVELL SD88SA02*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Micron M500 SSDs firmware MU07 * NCQ Trim works? */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "MU07" }, /*quirks*/0 }, { /* * Micron M500 SSDs all other firmware * NCQ Trim doesn't work */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "*" }, /*quirks*/ADA_Q_NCQ_TRIM_BROKEN }, { /* * Micron M5[15]0 SSDs * NCQ Trim doesn't work, but only MU01 firmware */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M5[15]0*", "MU01" }, /*quirks*/ADA_Q_NCQ_TRIM_BROKEN }, { /* * Micron 5100 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron 5100 MTFDDAK*", "*" }, /*quirks*/ADA_Q_4K }, { /* * OCZ Agility 2 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" }, /*quirks*/ADA_Q_4K }, { /* * OCZ Agility 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY3*", "*" }, /*quirks*/ADA_Q_4K }, { /* * OCZ Deneva R Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "DENRSTE251M45*", "*" }, /*quirks*/ADA_Q_4K }, { /* * OCZ Vertex 2 SSDs (inc pro series) * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ?VERTEX2*", "*" }, /*quirks*/ADA_Q_4K }, { /* * OCZ Vertex 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-VERTEX3*", "*" }, /*quirks*/ADA_Q_4K }, { /* * OCZ Vertex 4 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-VERTEX4*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Samsung 750 SSDs * 4k optimised, NCQ TRIM seems to work */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 750*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Samsung 830 Series SSDs * 4k optimised, NCQ TRIM Broken (normal TRIM is fine) */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD 830 Series*", "*" }, /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* * Samsung 840 SSDs * 4k optimised, NCQ TRIM Broken (normal TRIM is fine) */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 840*", "*" }, /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* * Samsung 845 SSDs * 4k optimised, NCQ TRIM Broken (normal TRIM is fine) */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 845*", "*" }, /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* * Samsung 850 SSDs * 4k optimised, NCQ TRIM broken (normal TRIM fine) */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 850*", "*" }, /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* * Samsung SM863 Series SSDs (MZ7KM*) * 4k optimised, NCQ believed to be working */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7KM*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Samsung 843T Series SSDs (MZ7WD*) * Samsung PM851 Series SSDs (MZ7TE*) * Samsung PM853T Series SSDs (MZ7GE*) * 4k optimised, NCQ believed to be broken since these are * appear to be built with the same controllers as the 840/850. */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7*", "*" }, /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* * Same as for SAMSUNG MZ7* but enable the quirks for SSD * starting with MZ7* too */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "MZ7*", "*" }, /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* * Samsung PM851 Series SSDs Dell OEM * device model "SAMSUNG SSD PM851 mSATA 256GB" * 4k optimised, NCQ broken */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD PM851*", "*" }, /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN }, { /* * SuperTalent TeraDrive CT SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "FTM??CT25H*", "*" }, /*quirks*/ADA_Q_4K }, { /* * XceedIOPS SATA SSDs * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SG9XCS2D*", "*" }, /*quirks*/ADA_Q_4K }, { /* * Samsung drive that doesn't support READ LOG EXT or * READ LOG DMA EXT, despite reporting that it does in * ATA identify data: * SAMSUNG HD200HJ KF100-06 */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD200*", "*" }, /*quirks*/ADA_Q_LOG_BROKEN }, { /* * Samsung drive that doesn't support READ LOG EXT or * READ LOG DMA EXT, despite reporting that it does in * ATA identify data: * SAMSUNG HD501LJ CR100-10 */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG HD501*", "*" }, /*quirks*/ADA_Q_LOG_BROKEN }, { /* * Seagate Lamarr 8TB Shingled Magnetic Recording (SMR) * Drive Managed SATA hard drive. This drive doesn't report * in firmware that it is a drive managed SMR drive. */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "ST8000AS000[23]*", "*" }, /*quirks*/ADA_Q_SMR_DM }, { /* Default */ { T_ANY, SIP_MEDIA_REMOVABLE|SIP_MEDIA_FIXED, /*vendor*/"*", /*product*/"*", /*revision*/"*" }, /*quirks*/0 }, }; static disk_strategy_t adastrategy; static dumper_t adadump; static periph_init_t adainit; static void adadiskgonecb(struct disk *dp); static periph_oninv_t adaoninvalidate; static periph_dtor_t adacleanup; static void adaasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static int adazonemodesysctl(SYSCTL_HANDLER_ARGS); static int adazonesupsysctl(SYSCTL_HANDLER_ARGS); static void adasysctlinit(void *context, int pending); static int adagetattr(struct bio *bp); static void adasetflags(struct ada_softc *softc, struct ccb_getdev *cgd); static periph_ctor_t adaregister; static void ada_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio); static void ada_cfaerase(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio); static int ada_zone_bio_to_ata(int disk_zone_cmd); static int ada_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp, int *queue_ccb); static periph_start_t adastart; static void adaprobedone(struct cam_periph *periph, union ccb *ccb); static void adazonedone(struct cam_periph *periph, union ccb *ccb); static void adadone(struct cam_periph *periph, union ccb *done_ccb); static int adaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static void adagetparams(struct cam_periph *periph, struct ccb_getdev *cgd); static timeout_t adasendorderedtag; static void adashutdown(void *arg, int howto); static void adasuspend(void *arg); static void adaresume(void *arg); #ifndef ADA_DEFAULT_TIMEOUT #define ADA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */ #endif #ifndef ADA_DEFAULT_RETRY #define ADA_DEFAULT_RETRY 4 #endif #ifndef ADA_DEFAULT_SEND_ORDERED #define ADA_DEFAULT_SEND_ORDERED 1 #endif #ifndef ADA_DEFAULT_SPINDOWN_SHUTDOWN #define ADA_DEFAULT_SPINDOWN_SHUTDOWN 1 #endif #ifndef ADA_DEFAULT_SPINDOWN_SUSPEND #define ADA_DEFAULT_SPINDOWN_SUSPEND 1 #endif #ifndef ADA_DEFAULT_READ_AHEAD #define ADA_DEFAULT_READ_AHEAD 1 #endif #ifndef ADA_DEFAULT_WRITE_CACHE #define ADA_DEFAULT_WRITE_CACHE 1 #endif #define ADA_RA (softc->read_ahead >= 0 ? \ softc->read_ahead : ada_read_ahead) #define ADA_WC (softc->write_cache >= 0 ? \ softc->write_cache : ada_write_cache) /* * Most platforms map firmware geometry to actual, but some don't. If * not overridden, default to nothing. */ #ifndef ata_disk_firmware_geom_adjust #define ata_disk_firmware_geom_adjust(disk) #endif static int ada_retry_count = ADA_DEFAULT_RETRY; static int ada_default_timeout = ADA_DEFAULT_TIMEOUT; static int ada_send_ordered = ADA_DEFAULT_SEND_ORDERED; static int ada_spindown_shutdown = ADA_DEFAULT_SPINDOWN_SHUTDOWN; static int ada_spindown_suspend = ADA_DEFAULT_SPINDOWN_SUSPEND; static int ada_read_ahead = ADA_DEFAULT_READ_AHEAD; static int ada_write_cache = ADA_DEFAULT_WRITE_CACHE; static SYSCTL_NODE(_kern_cam, OID_AUTO, ada, CTLFLAG_RD, 0, "CAM Direct Access Disk driver"); SYSCTL_INT(_kern_cam_ada, OID_AUTO, retry_count, CTLFLAG_RWTUN, &ada_retry_count, 0, "Normal I/O retry count"); SYSCTL_INT(_kern_cam_ada, OID_AUTO, default_timeout, CTLFLAG_RWTUN, &ada_default_timeout, 0, "Normal I/O timeout (in seconds)"); SYSCTL_INT(_kern_cam_ada, OID_AUTO, send_ordered, CTLFLAG_RWTUN, &ada_send_ordered, 0, "Send Ordered Tags"); SYSCTL_INT(_kern_cam_ada, OID_AUTO, spindown_shutdown, CTLFLAG_RWTUN, &ada_spindown_shutdown, 0, "Spin down upon shutdown"); SYSCTL_INT(_kern_cam_ada, OID_AUTO, spindown_suspend, CTLFLAG_RWTUN, &ada_spindown_suspend, 0, "Spin down upon suspend"); SYSCTL_INT(_kern_cam_ada, OID_AUTO, read_ahead, CTLFLAG_RWTUN, &ada_read_ahead, 0, "Enable disk read-ahead"); SYSCTL_INT(_kern_cam_ada, OID_AUTO, write_cache, CTLFLAG_RWTUN, &ada_write_cache, 0, "Enable disk write cache"); /* * ADA_ORDEREDTAG_INTERVAL determines how often, relative * to the default timeout, we check to see whether an ordered * tagged transaction is appropriate to prevent simple tag * starvation. Since we'd like to ensure that there is at least * 1/2 of the timeout length left for a starved transaction to * complete after we've sent an ordered tag, we must poll at least * four times in every timeout period. This takes care of the worst * case where a starved transaction starts during an interval that * meets the requirement "don't send an ordered tag" test so it takes * us two intervals to determine that a tag must be sent. */ #ifndef ADA_ORDEREDTAG_INTERVAL #define ADA_ORDEREDTAG_INTERVAL 4 #endif static struct periph_driver adadriver = { adainit, "ada", TAILQ_HEAD_INITIALIZER(adadriver.units), /* generation */ 0 }; static int adadeletemethodsysctl(SYSCTL_HANDLER_ARGS); PERIPHDRIVER_DECLARE(ada, adadriver); static MALLOC_DEFINE(M_ATADA, "ata_da", "ata_da buffers"); static int adaopen(struct disk *dp) { struct cam_periph *periph; struct ada_softc *softc; int error; periph = (struct cam_periph *)dp->d_drv1; if (cam_periph_acquire(periph) != 0) { return(ENXIO); } cam_periph_lock(periph); if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) { cam_periph_unlock(periph); cam_periph_release(periph); return (error); } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("adaopen\n")); softc = (struct ada_softc *)periph->softc; softc->flags |= ADA_FLAG_OPEN; cam_periph_unhold(periph); cam_periph_unlock(periph); return (0); } static int adaclose(struct disk *dp) { struct cam_periph *periph; struct ada_softc *softc; union ccb *ccb; int error; periph = (struct cam_periph *)dp->d_drv1; softc = (struct ada_softc *)periph->softc; cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("adaclose\n")); /* We only sync the cache if the drive is capable of it. */ if ((softc->flags & ADA_FLAG_DIRTY) != 0 && (softc->flags & ADA_FLAG_CAN_FLUSHCACHE) != 0 && (periph->flags & CAM_PERIPH_INVALID) == 0 && cam_periph_hold(periph, PRIBIO) == 0) { ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); cam_fill_ataio(&ccb->ataio, 1, adadone, CAM_DIR_NONE, 0, NULL, 0, ada_default_timeout*1000); if (softc->flags & ADA_FLAG_CAN_48BIT) ata_48bit_cmd(&ccb->ataio, ATA_FLUSHCACHE48, 0, 0, 0); else ata_28bit_cmd(&ccb->ataio, ATA_FLUSHCACHE, 0, 0, 0); error = cam_periph_runccb(ccb, adaerror, /*cam_flags*/0, /*sense_flags*/0, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); softc->flags &= ~ADA_FLAG_DIRTY; xpt_release_ccb(ccb); cam_periph_unhold(periph); } softc->flags &= ~ADA_FLAG_OPEN; while (softc->refcount != 0) cam_periph_sleep(periph, &softc->refcount, PRIBIO, "adaclose", 1); cam_periph_unlock(periph); cam_periph_release(periph); return (0); } static void adaschedule(struct cam_periph *periph) { struct ada_softc *softc = (struct ada_softc *)periph->softc; if (softc->state != ADA_STATE_NORMAL) return; cam_iosched_schedule(softc->cam_iosched, periph); } /* * Actually translate the requested transfer into one the physical driver * can understand. The transfer is described by a buf and will include * only one physical transfer. */ static void adastrategy(struct bio *bp) { struct cam_periph *periph; struct ada_softc *softc; periph = (struct cam_periph *)bp->bio_disk->d_drv1; softc = (struct ada_softc *)periph->softc; cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("adastrategy(%p)\n", bp)); /* * If the device has been made invalid, error out */ if ((periph->flags & CAM_PERIPH_INVALID) != 0) { cam_periph_unlock(periph); biofinish(bp, NULL, ENXIO); return; } /* * Zone commands must be ordered, because they can depend on the * effects of previously issued commands, and they may affect * commands after them. */ if (bp->bio_cmd == BIO_ZONE) bp->bio_flags |= BIO_ORDERED; /* * Place it in the queue of disk activities for this disk */ cam_iosched_queue_work(softc->cam_iosched, bp); /* * Schedule ourselves for performing the work. */ adaschedule(periph); cam_periph_unlock(periph); return; } static int adadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct cam_periph *periph; struct ada_softc *softc; u_int secsize; struct ccb_ataio ataio; struct disk *dp; uint64_t lba; uint16_t count; int error = 0; dp = arg; periph = dp->d_drv1; softc = (struct ada_softc *)periph->softc; secsize = softc->params.secsize; lba = offset / secsize; count = length / secsize; if ((periph->flags & CAM_PERIPH_INVALID) != 0) return (ENXIO); memset(&ataio, 0, sizeof(ataio)); if (length > 0) { xpt_setup_ccb(&ataio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); ataio.ccb_h.ccb_state = ADA_CCB_DUMP; cam_fill_ataio(&ataio, 0, adadone, CAM_DIR_OUT, 0, (u_int8_t *) virtual, length, ada_default_timeout*1000); if ((softc->flags & ADA_FLAG_CAN_48BIT) && (lba + count >= ATA_MAX_28BIT_LBA || count >= 256)) { ata_48bit_cmd(&ataio, ATA_WRITE_DMA48, 0, lba, count); } else { ata_28bit_cmd(&ataio, ATA_WRITE_DMA, 0, lba, count); } error = cam_periph_runccb((union ccb *)&ataio, adaerror, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if (error != 0) printf("Aborting dump due to I/O error.\n"); return (error); } if (softc->flags & ADA_FLAG_CAN_FLUSHCACHE) { xpt_setup_ccb(&ataio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); /* * Tell the drive to flush its internal cache. if we * can't flush in 5s we have big problems. No need to * wait the default 60s to detect problems. */ ataio.ccb_h.ccb_state = ADA_CCB_DUMP; cam_fill_ataio(&ataio, 0, adadone, CAM_DIR_NONE, 0, NULL, 0, 5*1000); if (softc->flags & ADA_FLAG_CAN_48BIT) ata_48bit_cmd(&ataio, ATA_FLUSHCACHE48, 0, 0, 0); else ata_28bit_cmd(&ataio, ATA_FLUSHCACHE, 0, 0, 0); error = cam_periph_runccb((union ccb *)&ataio, adaerror, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); } return (error); } static void adainit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, adaasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("ada: Failed to attach master async callback " "due to status 0x%x!\n", status); } else if (ada_send_ordered) { /* Register our event handlers */ if ((EVENTHANDLER_REGISTER(power_suspend, adasuspend, NULL, EVENTHANDLER_PRI_LAST)) == NULL) printf("adainit: power event registration failed!\n"); if ((EVENTHANDLER_REGISTER(power_resume, adaresume, NULL, EVENTHANDLER_PRI_LAST)) == NULL) printf("adainit: power event registration failed!\n"); if ((EVENTHANDLER_REGISTER(shutdown_post_sync, adashutdown, NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) printf("adainit: shutdown event registration failed!\n"); } } /* * Callback from GEOM, called when it has finished cleaning up its * resources. */ static void adadiskgonecb(struct disk *dp) { struct cam_periph *periph; periph = (struct cam_periph *)dp->d_drv1; cam_periph_release(periph); } static void adaoninvalidate(struct cam_periph *periph) { struct ada_softc *softc; softc = (struct ada_softc *)periph->softc; /* * De-register any async callbacks. */ xpt_register_async(0, adaasync, periph, periph->path); #ifdef CAM_IO_STATS softc->invalidations++; #endif /* * Return all queued I/O with ENXIO. * XXX Handle any transactions queued to the card * with XPT_ABORT_CCB. */ cam_iosched_flush(softc->cam_iosched, NULL, ENXIO); disk_gone(softc->disk); } static void adacleanup(struct cam_periph *periph) { struct ada_softc *softc; softc = (struct ada_softc *)periph->softc; cam_periph_unlock(periph); cam_iosched_fini(softc->cam_iosched); /* * If we can't free the sysctl tree, oh well... */ if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0) { #ifdef CAM_IO_STATS if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0) xpt_print(periph->path, "can't remove sysctl stats context\n"); #endif if (sysctl_ctx_free(&softc->sysctl_ctx) != 0) xpt_print(periph->path, "can't remove sysctl context\n"); } disk_destroy(softc->disk); callout_drain(&softc->sendordered_c); free(softc, M_DEVBUF); cam_periph_lock(periph); } static void adasetdeletemethod(struct ada_softc *softc) { if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM) softc->delete_method = ADA_DELETE_NCQ_DSM_TRIM; else if (softc->flags & ADA_FLAG_CAN_TRIM) softc->delete_method = ADA_DELETE_DSM_TRIM; else if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT)) softc->delete_method = ADA_DELETE_CFA_ERASE; else softc->delete_method = ADA_DELETE_NONE; } static void adaasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { struct ccb_getdev cgd; struct cam_periph *periph; struct ada_softc *softc; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; if (cgd->protocol != PROTO_ATA) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(adaregister, adaoninvalidate, adacleanup, adastart, "ada", CAM_PERIPH_BIO, path, adaasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) printf("adaasync: Unable to attach to new device " "due to status 0x%x\n", status); break; } case AC_GETDEV_CHANGED: { softc = (struct ada_softc *)periph->softc; xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); /* * Set/clear support flags based on the new Identify data. */ adasetflags(softc, &cgd); cam_periph_async(periph, code, path, arg); break; } case AC_ADVINFO_CHANGED: { uintptr_t buftype; buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct ada_softc *softc; softc = periph->softc; disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); } break; } case AC_SENT_BDR: case AC_BUS_RESET: { softc = (struct ada_softc *)periph->softc; cam_periph_async(periph, code, path, arg); if (softc->state != ADA_STATE_NORMAL) break; xpt_setup_ccb(&cgd.ccb_h, periph->path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); if (ADA_RA >= 0 && softc->flags & ADA_FLAG_CAN_RAHEAD) softc->state = ADA_STATE_RAHEAD; else if (ADA_WC >= 0 && softc->flags & ADA_FLAG_CAN_WCACHE) softc->state = ADA_STATE_WCACHE; else if ((softc->flags & ADA_FLAG_CAN_LOG) && (softc->zone_mode != ADA_ZONE_NONE)) softc->state = ADA_STATE_LOGDIR; else break; if (cam_periph_acquire(periph) != 0) softc->state = ADA_STATE_NORMAL; else xpt_schedule(periph, CAM_PRIORITY_DEV); } default: cam_periph_async(periph, code, path, arg); break; } } static int adazonemodesysctl(SYSCTL_HANDLER_ARGS) { char tmpbuf[40]; struct ada_softc *softc; int error; softc = (struct ada_softc *)arg1; switch (softc->zone_mode) { case ADA_ZONE_DRIVE_MANAGED: snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed"); break; case ADA_ZONE_HOST_AWARE: snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware"); break; case ADA_ZONE_HOST_MANAGED: snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed"); break; case ADA_ZONE_NONE: default: snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned"); break; } error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req); return (error); } static int adazonesupsysctl(SYSCTL_HANDLER_ARGS) { char tmpbuf[180]; struct ada_softc *softc; struct sbuf sb; int error, first; unsigned int i; softc = (struct ada_softc *)arg1; error = 0; first = 1; sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0); for (i = 0; i < sizeof(ada_zone_desc_table) / sizeof(ada_zone_desc_table[0]); i++) { if (softc->zone_flags & ada_zone_desc_table[i].value) { if (first == 0) sbuf_printf(&sb, ", "); else first = 0; sbuf_cat(&sb, ada_zone_desc_table[i].desc); } } if (first == 1) sbuf_printf(&sb, "None"); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); return (error); } static void adasysctlinit(void *context, int pending) { struct cam_periph *periph; struct ada_softc *softc; char tmpstr[32], tmpstr2[16]; periph = (struct cam_periph *)context; /* periph was held for us when this task was enqueued */ if ((periph->flags & CAM_PERIPH_INVALID) != 0) { cam_periph_release(periph); return; } softc = (struct ada_softc *)periph->softc; snprintf(tmpstr, sizeof(tmpstr), "CAM ADA unit %d",periph->unit_number); snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number); sysctl_ctx_init(&softc->sysctl_ctx); softc->flags |= ADA_FLAG_SCTX_INIT; softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam_ada), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr, "device_index"); if (softc->sysctl_tree == NULL) { printf("adasysctlinit: unable to allocate sysctl tree\n"); cam_periph_release(periph); return; } SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RW, softc, 0, adadeletemethodsysctl, "A", "BIO_DELETE execution method"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "read_ahead", CTLFLAG_RW | CTLFLAG_MPSAFE, &softc->read_ahead, 0, "Enable disk read ahead."); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "write_cache", CTLFLAG_RW | CTLFLAG_MPSAFE, &softc->write_cache, 0, "Enable disk write cache."); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->unmappedio, 0, "Unmapped I/O leaf"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "rotating", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->rotating, 0, "Rotating media"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "zone_mode", CTLTYPE_STRING | CTLFLAG_RD, softc, 0, adazonemodesysctl, "A", "Zone Mode"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "zone_support", CTLTYPE_STRING | CTLFLAG_RD, softc, 0, adazonesupsysctl, "A", "Zone Support"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones, "Optimal Number of Open Sequential Write Preferred Zones"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "optimal_nonseq_zones", CTLFLAG_RD, &softc->optimal_nonseq_zones, "Optimal Number of Non-Sequentially Written Sequential Write " "Preferred Zones"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones, "Maximum Number of Open Sequential Write Required Zones"); #ifdef ADA_TEST_FAILURE /* * Add a 'door bell' sysctl which allows one to set it from userland * and cause something bad to happen. For the moment, we only allow * whacking the next read or write. */ SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "force_read_error", CTLFLAG_RW | CTLFLAG_MPSAFE, &softc->force_read_error, 0, "Force a read error for the next N reads."); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "force_write_error", CTLFLAG_RW | CTLFLAG_MPSAFE, &softc->force_write_error, 0, "Force a write error for the next N writes."); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "periodic_read_error", CTLFLAG_RW | CTLFLAG_MPSAFE, &softc->periodic_read_error, 0, "Force a read error every N reads (don't set too low)."); #endif #ifdef CAM_IO_STATS softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats", CTLFLAG_RD, 0, "Statistics"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->timeouts, 0, "Device timeouts reported by the SIM"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->errors, 0, "Transport errors reported by the SIM."); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->invalidations, 0, "Device pack invalidations."); #endif cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx, softc->sysctl_tree); cam_periph_release(periph); } static int adagetattr(struct bio *bp) { int ret; struct cam_periph *periph; periph = (struct cam_periph *)bp->bio_disk->d_drv1; cam_periph_lock(periph); ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute, periph->path); cam_periph_unlock(periph); if (ret == 0) bp->bio_completed = bp->bio_length; return ret; } static int adadeletemethodsysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; const char *p; struct ada_softc *softc; int i, error, value, methods; softc = (struct ada_softc *)arg1; value = softc->delete_method; if (value < 0 || value > ADA_DELETE_MAX) p = "UNKNOWN"; else p = ada_delete_method_names[value]; strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); methods = 1 << ADA_DELETE_DISABLE; if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT)) methods |= 1 << ADA_DELETE_CFA_ERASE; if (softc->flags & ADA_FLAG_CAN_TRIM) methods |= 1 << ADA_DELETE_DSM_TRIM; if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM) methods |= 1 << ADA_DELETE_NCQ_DSM_TRIM; for (i = 0; i <= ADA_DELETE_MAX; i++) { if (!(methods & (1 << i)) || strcmp(buf, ada_delete_method_names[i]) != 0) continue; softc->delete_method = i; return (0); } return (EINVAL); } static void adasetflags(struct ada_softc *softc, struct ccb_getdev *cgd) { if ((cgd->ident_data.capabilities1 & ATA_SUPPORT_DMA) && (cgd->inq_flags & SID_DMA)) softc->flags |= ADA_FLAG_CAN_DMA; else softc->flags &= ~ADA_FLAG_CAN_DMA; if (cgd->ident_data.support.command2 & ATA_SUPPORT_ADDRESS48) { softc->flags |= ADA_FLAG_CAN_48BIT; if (cgd->inq_flags & SID_DMA48) softc->flags |= ADA_FLAG_CAN_DMA48; else softc->flags &= ~ADA_FLAG_CAN_DMA48; } else softc->flags &= ~(ADA_FLAG_CAN_48BIT | ADA_FLAG_CAN_DMA48); if (cgd->ident_data.support.command2 & ATA_SUPPORT_FLUSHCACHE) softc->flags |= ADA_FLAG_CAN_FLUSHCACHE; else softc->flags &= ~ADA_FLAG_CAN_FLUSHCACHE; if (cgd->ident_data.support.command1 & ATA_SUPPORT_POWERMGT) softc->flags |= ADA_FLAG_CAN_POWERMGT; else softc->flags &= ~ADA_FLAG_CAN_POWERMGT; if ((cgd->ident_data.satacapabilities & ATA_SUPPORT_NCQ) && (cgd->inq_flags & SID_DMA) && (cgd->inq_flags & SID_CmdQue)) softc->flags |= ADA_FLAG_CAN_NCQ; else softc->flags &= ~ADA_FLAG_CAN_NCQ; if ((cgd->ident_data.support_dsm & ATA_SUPPORT_DSM_TRIM) && (cgd->inq_flags & SID_DMA)) { softc->flags |= ADA_FLAG_CAN_TRIM; softc->trim_max_ranges = TRIM_MAX_RANGES; if (cgd->ident_data.max_dsm_blocks != 0) { softc->trim_max_ranges = min(cgd->ident_data.max_dsm_blocks * ATA_DSM_BLK_RANGES, softc->trim_max_ranges); } /* * If we can do RCVSND_FPDMA_QUEUED commands, we may be able * to do NCQ trims, if we support trims at all. We also need * support from the SIM to do things properly. Perhaps we * should look at log 13 dword 0 bit 0 and dword 1 bit 0 are * set too... */ if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 && (softc->flags & ADA_FLAG_PIM_ATA_EXT) != 0 && (cgd->ident_data.satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 && (softc->flags & ADA_FLAG_CAN_TRIM) != 0) softc->flags |= ADA_FLAG_CAN_NCQ_TRIM; else softc->flags &= ~ADA_FLAG_CAN_NCQ_TRIM; } else softc->flags &= ~(ADA_FLAG_CAN_TRIM | ADA_FLAG_CAN_NCQ_TRIM); if (cgd->ident_data.support.command2 & ATA_SUPPORT_CFA) softc->flags |= ADA_FLAG_CAN_CFA; else softc->flags &= ~ADA_FLAG_CAN_CFA; /* * Now that we've set the appropriate flags, setup the delete * method. */ adasetdeletemethod(softc); if ((cgd->ident_data.support.extension & ATA_SUPPORT_GENLOG) && ((softc->quirks & ADA_Q_LOG_BROKEN) == 0)) softc->flags |= ADA_FLAG_CAN_LOG; else softc->flags &= ~ADA_FLAG_CAN_LOG; if ((cgd->ident_data.support3 & ATA_SUPPORT_ZONE_MASK) == ATA_SUPPORT_ZONE_HOST_AWARE) softc->zone_mode = ADA_ZONE_HOST_AWARE; else if (((cgd->ident_data.support3 & ATA_SUPPORT_ZONE_MASK) == ATA_SUPPORT_ZONE_DEV_MANAGED) || (softc->quirks & ADA_Q_SMR_DM)) softc->zone_mode = ADA_ZONE_DRIVE_MANAGED; else softc->zone_mode = ADA_ZONE_NONE; if (cgd->ident_data.support.command1 & ATA_SUPPORT_LOOKAHEAD) softc->flags |= ADA_FLAG_CAN_RAHEAD; else softc->flags &= ~ADA_FLAG_CAN_RAHEAD; if (cgd->ident_data.support.command1 & ATA_SUPPORT_WRITECACHE) softc->flags |= ADA_FLAG_CAN_WCACHE; else softc->flags &= ~ADA_FLAG_CAN_WCACHE; } static cam_status adaregister(struct cam_periph *periph, void *arg) { struct ada_softc *softc; struct ccb_pathinq cpi; struct ccb_getdev *cgd; struct disk_params *dp; struct sbuf sb; char *announce_buf; caddr_t match; u_int maxio; int quirks; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { printf("adaregister: no getdev CCB, can't register device\n"); return(CAM_REQ_CMP_ERR); } softc = (struct ada_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT|M_ZERO); if (softc == NULL) { printf("adaregister: Unable to probe new device. " "Unable to allocate softc\n"); return(CAM_REQ_CMP_ERR); } announce_buf = softc->announce_temp; bzero(announce_buf, ADA_ANNOUNCETMP_SZ); - if (cam_iosched_init(&softc->cam_iosched, periph, 0) != 0) { + if (cam_iosched_init(&softc->cam_iosched, periph) != 0) { printf("adaregister: Unable to probe new device. " "Unable to allocate iosched memory\n"); free(softc, M_DEVBUF); return(CAM_REQ_CMP_ERR); } periph->softc = softc; /* * See if this device has any quirks. */ match = cam_quirkmatch((caddr_t)&cgd->ident_data, (caddr_t)ada_quirk_table, nitems(ada_quirk_table), sizeof(*ada_quirk_table), ata_identify_match); if (match != NULL) softc->quirks = ((struct ada_quirk_entry *)match)->quirks; else softc->quirks = ADA_Q_NONE; xpt_path_inq(&cpi, periph->path); TASK_INIT(&softc->sysctl_task, 0, adasysctlinit, periph); /* * Register this media as a disk */ (void)cam_periph_hold(periph, PRIBIO); cam_periph_unlock(periph); snprintf(announce_buf, ADA_ANNOUNCETMP_SZ, "kern.cam.ada.%d.quirks", periph->unit_number); quirks = softc->quirks; TUNABLE_INT_FETCH(announce_buf, &quirks); softc->quirks = quirks; softc->read_ahead = -1; snprintf(announce_buf, ADA_ANNOUNCETMP_SZ, "kern.cam.ada.%d.read_ahead", periph->unit_number); TUNABLE_INT_FETCH(announce_buf, &softc->read_ahead); softc->write_cache = -1; snprintf(announce_buf, ADA_ANNOUNCETMP_SZ, "kern.cam.ada.%d.write_cache", periph->unit_number); TUNABLE_INT_FETCH(announce_buf, &softc->write_cache); /* * Set support flags based on the Identify data and quirks. */ adasetflags(softc, cgd); /* Disable queue sorting for non-rotational media by default. */ if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING) { softc->rotating = 0; } else { softc->rotating = 1; } cam_iosched_set_sort_queue(softc->cam_iosched, softc->rotating ? -1 : 0); adagetparams(periph, cgd); softc->disk = disk_alloc(); softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate; softc->disk->d_devstat = devstat_new_entry(periph->periph_name, periph->unit_number, softc->params.secsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | XPORT_DEVSTAT_TYPE(cpi.transport), DEVSTAT_PRIORITY_DISK); softc->disk->d_open = adaopen; softc->disk->d_close = adaclose; softc->disk->d_strategy = adastrategy; softc->disk->d_getattr = adagetattr; softc->disk->d_dump = adadump; softc->disk->d_gone = adadiskgonecb; softc->disk->d_name = "ada"; softc->disk->d_drv1 = periph; maxio = cpi.maxio; /* Honor max I/O size of SIM */ if (maxio == 0) maxio = DFLTPHYS; /* traditional default */ else if (maxio > MAXPHYS) maxio = MAXPHYS; /* for safety */ if (softc->flags & ADA_FLAG_CAN_48BIT) maxio = min(maxio, 65536 * softc->params.secsize); else /* 28bit ATA command limit */ maxio = min(maxio, 256 * softc->params.secsize); softc->disk->d_maxsize = maxio; softc->disk->d_unit = periph->unit_number; softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE; if (softc->flags & ADA_FLAG_CAN_FLUSHCACHE) softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE; if (softc->flags & ADA_FLAG_CAN_TRIM) { softc->disk->d_flags |= DISKFLAG_CANDELETE; softc->disk->d_delmaxsize = softc->params.secsize * ATA_DSM_RANGE_MAX * softc->trim_max_ranges; } else if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT)) { softc->disk->d_flags |= DISKFLAG_CANDELETE; softc->disk->d_delmaxsize = 256 * softc->params.secsize; } else softc->disk->d_delmaxsize = maxio; if ((cpi.hba_misc & PIM_UNMAPPED) != 0) { softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO; softc->unmappedio = 1; } if (cpi.hba_misc & PIM_ATA_EXT) softc->flags |= ADA_FLAG_PIM_ATA_EXT; strlcpy(softc->disk->d_descr, cgd->ident_data.model, MIN(sizeof(softc->disk->d_descr), sizeof(cgd->ident_data.model))); strlcpy(softc->disk->d_ident, cgd->ident_data.serial, MIN(sizeof(softc->disk->d_ident), sizeof(cgd->ident_data.serial))); softc->disk->d_hba_vendor = cpi.hba_vendor; softc->disk->d_hba_device = cpi.hba_device; softc->disk->d_hba_subvendor = cpi.hba_subvendor; softc->disk->d_hba_subdevice = cpi.hba_subdevice; softc->disk->d_sectorsize = softc->params.secsize; softc->disk->d_mediasize = (off_t)softc->params.sectors * softc->params.secsize; if (ata_physical_sector_size(&cgd->ident_data) != softc->params.secsize) { softc->disk->d_stripesize = ata_physical_sector_size(&cgd->ident_data); softc->disk->d_stripeoffset = (softc->disk->d_stripesize - ata_logical_sector_offset(&cgd->ident_data)) % softc->disk->d_stripesize; } else if (softc->quirks & ADA_Q_4K) { softc->disk->d_stripesize = 4096; softc->disk->d_stripeoffset = 0; } softc->disk->d_fwsectors = softc->params.secs_per_track; softc->disk->d_fwheads = softc->params.heads; ata_disk_firmware_geom_adjust(softc->disk); /* * Acquire a reference to the periph before we register with GEOM. * We'll release this reference once GEOM calls us back (via * adadiskgonecb()) telling us that our provider has been freed. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } disk_create(softc->disk, DISK_VERSION); cam_periph_lock(periph); dp = &softc->params; snprintf(announce_buf, ADA_ANNOUNCETMP_SZ, "%juMB (%ju %u byte sectors)", ((uintmax_t)dp->secsize * dp->sectors) / (1024 * 1024), (uintmax_t)dp->sectors, dp->secsize); sbuf_new(&sb, softc->announce_buffer, ADA_ANNOUNCE_SZ, SBUF_FIXEDLEN); xpt_announce_periph_sbuf(periph, &sb, announce_buf); xpt_announce_quirks_sbuf(periph, &sb, softc->quirks, ADA_Q_BIT_STRING); sbuf_finish(&sb); sbuf_putbuf(&sb); /* * Create our sysctl variables, now that we know * we have successfully attached. */ if (cam_periph_acquire(periph) == 0) taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task); /* * Add async callbacks for bus reset and * bus device reset calls. I don't bother * checking if this fails as, in most cases, * the system will function just fine without * them and the only alternative would be to * not attach the device on failure. */ xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE | AC_GETDEV_CHANGED | AC_ADVINFO_CHANGED, adaasync, periph, periph->path); /* * Schedule a periodic event to occasionally send an * ordered tag to a device. */ callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0); callout_reset(&softc->sendordered_c, (ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL, adasendorderedtag, softc); if (ADA_RA >= 0 && softc->flags & ADA_FLAG_CAN_RAHEAD) { softc->state = ADA_STATE_RAHEAD; } else if (ADA_WC >= 0 && softc->flags & ADA_FLAG_CAN_WCACHE) { softc->state = ADA_STATE_WCACHE; } else if ((softc->flags & ADA_FLAG_CAN_LOG) && (softc->zone_mode != ADA_ZONE_NONE)) { softc->state = ADA_STATE_LOGDIR; } else { /* * Nothing to probe, so we can just transition to the * normal state. */ adaprobedone(periph, NULL); return(CAM_REQ_CMP); } xpt_schedule(periph, CAM_PRIORITY_DEV); return(CAM_REQ_CMP); } static int ada_dsmtrim_req_create(struct ada_softc *softc, struct bio *bp, struct trim_request *req) { uint64_t lastlba = (uint64_t)-1; int c, lastcount = 0, off, ranges = 0; bzero(req, sizeof(*req)); TAILQ_INIT(&req->bps); do { uint64_t lba = bp->bio_pblkno; int count = bp->bio_bcount / softc->params.secsize; /* Try to extend the previous range. */ if (lba == lastlba) { c = min(count, ATA_DSM_RANGE_MAX - lastcount); lastcount += c; off = (ranges - 1) * ATA_DSM_RANGE_SIZE; req->data[off + 6] = lastcount & 0xff; req->data[off + 7] = (lastcount >> 8) & 0xff; count -= c; lba += c; } while (count > 0) { c = min(count, ATA_DSM_RANGE_MAX); off = ranges * ATA_DSM_RANGE_SIZE; req->data[off + 0] = lba & 0xff; req->data[off + 1] = (lba >> 8) & 0xff; req->data[off + 2] = (lba >> 16) & 0xff; req->data[off + 3] = (lba >> 24) & 0xff; req->data[off + 4] = (lba >> 32) & 0xff; req->data[off + 5] = (lba >> 40) & 0xff; req->data[off + 6] = c & 0xff; req->data[off + 7] = (c >> 8) & 0xff; lba += c; count -= c; lastcount = c; ranges++; /* * Its the caller's responsibility to ensure the * request will fit so we don't need to check for * overrun here */ } lastlba = lba; TAILQ_INSERT_TAIL(&req->bps, bp, bio_queue); bp = cam_iosched_next_trim(softc->cam_iosched); if (bp == NULL) break; if (bp->bio_bcount / softc->params.secsize > (softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) { cam_iosched_put_back_trim(softc->cam_iosched, bp); break; } } while (1); return (ranges); } static void ada_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio) { struct trim_request *req = &softc->trim_req; int ranges; ranges = ada_dsmtrim_req_create(softc, bp, req); cam_fill_ataio(ataio, ada_retry_count, adadone, CAM_DIR_OUT, 0, req->data, howmany(ranges, ATA_DSM_BLK_RANGES) * ATA_DSM_BLK_SIZE, ada_default_timeout * 1000); ata_48bit_cmd(ataio, ATA_DATA_SET_MANAGEMENT, ATA_DSM_TRIM, 0, howmany(ranges, ATA_DSM_BLK_RANGES)); } static void ada_ncq_dsmtrim(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio) { struct trim_request *req = &softc->trim_req; int ranges; ranges = ada_dsmtrim_req_create(softc, bp, req); cam_fill_ataio(ataio, ada_retry_count, adadone, CAM_DIR_OUT, 0, req->data, howmany(ranges, ATA_DSM_BLK_RANGES) * ATA_DSM_BLK_SIZE, ada_default_timeout * 1000); ata_ncq_cmd(ataio, ATA_SEND_FPDMA_QUEUED, 0, howmany(ranges, ATA_DSM_BLK_RANGES)); ataio->cmd.sector_count_exp = ATA_SFPDMA_DSM; ataio->ata_flags |= ATA_FLAG_AUX; ataio->aux = 1; } static void ada_cfaerase(struct ada_softc *softc, struct bio *bp, struct ccb_ataio *ataio) { struct trim_request *req = &softc->trim_req; uint64_t lba = bp->bio_pblkno; uint16_t count = bp->bio_bcount / softc->params.secsize; bzero(req, sizeof(*req)); TAILQ_INIT(&req->bps); TAILQ_INSERT_TAIL(&req->bps, bp, bio_queue); cam_fill_ataio(ataio, ada_retry_count, adadone, CAM_DIR_NONE, 0, NULL, 0, ada_default_timeout*1000); if (count >= 256) count = 0; ata_28bit_cmd(ataio, ATA_CFA_ERASE, 0, lba, count); } static int ada_zone_bio_to_ata(int disk_zone_cmd) { switch (disk_zone_cmd) { case DISK_ZONE_OPEN: return ATA_ZM_OPEN_ZONE; case DISK_ZONE_CLOSE: return ATA_ZM_CLOSE_ZONE; case DISK_ZONE_FINISH: return ATA_ZM_FINISH_ZONE; case DISK_ZONE_RWP: return ATA_ZM_RWP; } return -1; } static int ada_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp, int *queue_ccb) { struct ada_softc *softc; int error; error = 0; if (bp->bio_cmd != BIO_ZONE) { error = EINVAL; goto bailout; } softc = periph->softc; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: case DISK_ZONE_CLOSE: case DISK_ZONE_FINISH: case DISK_ZONE_RWP: { int zone_flags; int zone_sa; uint64_t lba; zone_sa = ada_zone_bio_to_ata(bp->bio_zone.zone_cmd); if (zone_sa == -1) { xpt_print(periph->path, "Cannot translate zone " "cmd %#x to ATA\n", bp->bio_zone.zone_cmd); error = EINVAL; goto bailout; } zone_flags = 0; lba = bp->bio_zone.zone_params.rwp.id; if (bp->bio_zone.zone_params.rwp.flags & DISK_ZONE_RWP_FLAG_ALL) zone_flags |= ZBC_OUT_ALL; ata_zac_mgmt_out(&ccb->ataio, /*retries*/ ada_retry_count, /*cbfcnp*/ adadone, /*use_ncq*/ (softc->flags & ADA_FLAG_PIM_ATA_EXT) ? 1 : 0, /*zm_action*/ zone_sa, /*zone_id*/ lba, /*zone_flags*/ zone_flags, /*sector_count*/ 0, /*data_ptr*/ NULL, /*dxfer_len*/ 0, /*timeout*/ ada_default_timeout * 1000); *queue_ccb = 1; break; } case DISK_ZONE_REPORT_ZONES: { uint8_t *rz_ptr; uint32_t num_entries, alloc_size; struct disk_zone_report *rep; rep = &bp->bio_zone.zone_params.report; num_entries = rep->entries_allocated; if (num_entries == 0) { xpt_print(periph->path, "No entries allocated for " "Report Zones request\n"); error = EINVAL; goto bailout; } alloc_size = sizeof(struct scsi_report_zones_hdr) + (sizeof(struct scsi_report_zones_desc) * num_entries); alloc_size = min(alloc_size, softc->disk->d_maxsize); rz_ptr = malloc(alloc_size, M_ATADA, M_NOWAIT | M_ZERO); if (rz_ptr == NULL) { xpt_print(periph->path, "Unable to allocate memory " "for Report Zones request\n"); error = ENOMEM; goto bailout; } ata_zac_mgmt_in(&ccb->ataio, /*retries*/ ada_retry_count, /*cbcfnp*/ adadone, /*use_ncq*/ (softc->flags & ADA_FLAG_PIM_ATA_EXT) ? 1 : 0, /*zm_action*/ ATA_ZM_REPORT_ZONES, /*zone_id*/ rep->starting_id, /*zone_flags*/ rep->rep_options, /*data_ptr*/ rz_ptr, /*dxfer_len*/ alloc_size, /*timeout*/ ada_default_timeout * 1000); /* * For BIO_ZONE, this isn't normally needed. However, it * is used by devstat_end_transaction_bio() to determine * how much data was transferred. */ /* * XXX KDM we have a problem. But I'm not sure how to fix * it. devstat uses bio_bcount - bio_resid to calculate * the amount of data transferred. The GEOM disk code * uses bio_length - bio_resid to calculate the amount of * data in bio_completed. We have different structure * sizes above and below the ada(4) driver. So, if we * use the sizes above, the amount transferred won't be * quite accurate for devstat. If we use different sizes * for bio_bcount and bio_length (above and below * respectively), then the residual needs to match one or * the other. Everything is calculated after the bio * leaves the driver, so changing the values around isn't * really an option. For now, just set the count to the * passed in length. This means that the calculations * above (e.g. bio_completed) will be correct, but the * amount of data reported to devstat will be slightly * under or overstated. */ bp->bio_bcount = bp->bio_length; *queue_ccb = 1; break; } case DISK_ZONE_GET_PARAMS: { struct disk_zone_disk_params *params; params = &bp->bio_zone.zone_params.disk_params; bzero(params, sizeof(*params)); switch (softc->zone_mode) { case ADA_ZONE_DRIVE_MANAGED: params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED; break; case ADA_ZONE_HOST_AWARE: params->zone_mode = DISK_ZONE_MODE_HOST_AWARE; break; case ADA_ZONE_HOST_MANAGED: params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED; break; default: case ADA_ZONE_NONE: params->zone_mode = DISK_ZONE_MODE_NONE; break; } if (softc->zone_flags & ADA_ZONE_FLAG_URSWRZ) params->flags |= DISK_ZONE_DISK_URSWRZ; if (softc->zone_flags & ADA_ZONE_FLAG_OPT_SEQ_SET) { params->optimal_seq_zones = softc->optimal_seq_zones; params->flags |= DISK_ZONE_OPT_SEQ_SET; } if (softc->zone_flags & ADA_ZONE_FLAG_OPT_NONSEQ_SET) { params->optimal_nonseq_zones = softc->optimal_nonseq_zones; params->flags |= DISK_ZONE_OPT_NONSEQ_SET; } if (softc->zone_flags & ADA_ZONE_FLAG_MAX_SEQ_SET) { params->max_seq_zones = softc->max_seq_zones; params->flags |= DISK_ZONE_MAX_SEQ_SET; } if (softc->zone_flags & ADA_ZONE_FLAG_RZ_SUP) params->flags |= DISK_ZONE_RZ_SUP; if (softc->zone_flags & ADA_ZONE_FLAG_OPEN_SUP) params->flags |= DISK_ZONE_OPEN_SUP; if (softc->zone_flags & ADA_ZONE_FLAG_CLOSE_SUP) params->flags |= DISK_ZONE_CLOSE_SUP; if (softc->zone_flags & ADA_ZONE_FLAG_FINISH_SUP) params->flags |= DISK_ZONE_FINISH_SUP; if (softc->zone_flags & ADA_ZONE_FLAG_RWP_SUP) params->flags |= DISK_ZONE_RWP_SUP; break; } default: break; } bailout: return (error); } static void adastart(struct cam_periph *periph, union ccb *start_ccb) { struct ada_softc *softc = (struct ada_softc *)periph->softc; struct ccb_ataio *ataio = &start_ccb->ataio; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("adastart\n")); switch (softc->state) { case ADA_STATE_NORMAL: { struct bio *bp; u_int8_t tag_code; bp = cam_iosched_next_bio(softc->cam_iosched); if (bp == NULL) { xpt_release_ccb(start_ccb); break; } if ((bp->bio_flags & BIO_ORDERED) != 0 || (bp->bio_cmd != BIO_DELETE && (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) { softc->flags &= ~ADA_FLAG_NEED_OTAG; softc->flags |= ADA_FLAG_WAS_OTAG; tag_code = 0; } else { tag_code = 1; } switch (bp->bio_cmd) { case BIO_WRITE: case BIO_READ: { uint64_t lba = bp->bio_pblkno; uint16_t count = bp->bio_bcount / softc->params.secsize; void *data_ptr; int rw_op; if (bp->bio_cmd == BIO_WRITE) { softc->flags |= ADA_FLAG_DIRTY; rw_op = CAM_DIR_OUT; } else { rw_op = CAM_DIR_IN; } data_ptr = bp->bio_data; if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) { rw_op |= CAM_DATA_BIO; data_ptr = bp; } #ifdef ADA_TEST_FAILURE int fail = 0; /* * Support the failure ioctls. If the command is a * read, and there are pending forced read errors, or * if a write and pending write errors, then fail this * operation with EIO. This is useful for testing * purposes. Also, support having every Nth read fail. * * This is a rather blunt tool. */ if (bp->bio_cmd == BIO_READ) { if (softc->force_read_error) { softc->force_read_error--; fail = 1; } if (softc->periodic_read_error > 0) { if (++softc->periodic_read_count >= softc->periodic_read_error) { softc->periodic_read_count = 0; fail = 1; } } } else { if (softc->force_write_error) { softc->force_write_error--; fail = 1; } } if (fail) { biofinish(bp, NULL, EIO); xpt_release_ccb(start_ccb); adaschedule(periph); return; } #endif KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_bcount + bp->bio_ma_offset) / PAGE_SIZE == bp->bio_ma_n, ("Short bio %p", bp)); cam_fill_ataio(ataio, ada_retry_count, adadone, rw_op, 0, data_ptr, bp->bio_bcount, ada_default_timeout*1000); if ((softc->flags & ADA_FLAG_CAN_NCQ) && tag_code) { if (bp->bio_cmd == BIO_READ) { ata_ncq_cmd(ataio, ATA_READ_FPDMA_QUEUED, lba, count); } else { ata_ncq_cmd(ataio, ATA_WRITE_FPDMA_QUEUED, lba, count); } } else if ((softc->flags & ADA_FLAG_CAN_48BIT) && (lba + count >= ATA_MAX_28BIT_LBA || count > 256)) { if (softc->flags & ADA_FLAG_CAN_DMA48) { if (bp->bio_cmd == BIO_READ) { ata_48bit_cmd(ataio, ATA_READ_DMA48, 0, lba, count); } else { ata_48bit_cmd(ataio, ATA_WRITE_DMA48, 0, lba, count); } } else { if (bp->bio_cmd == BIO_READ) { ata_48bit_cmd(ataio, ATA_READ_MUL48, 0, lba, count); } else { ata_48bit_cmd(ataio, ATA_WRITE_MUL48, 0, lba, count); } } } else { if (count == 256) count = 0; if (softc->flags & ADA_FLAG_CAN_DMA) { if (bp->bio_cmd == BIO_READ) { ata_28bit_cmd(ataio, ATA_READ_DMA, 0, lba, count); } else { ata_28bit_cmd(ataio, ATA_WRITE_DMA, 0, lba, count); } } else { if (bp->bio_cmd == BIO_READ) { ata_28bit_cmd(ataio, ATA_READ_MUL, 0, lba, count); } else { ata_28bit_cmd(ataio, ATA_WRITE_MUL, 0, lba, count); } } } break; } case BIO_DELETE: switch (softc->delete_method) { case ADA_DELETE_NCQ_DSM_TRIM: ada_ncq_dsmtrim(softc, bp, ataio); break; case ADA_DELETE_DSM_TRIM: ada_dsmtrim(softc, bp, ataio); break; case ADA_DELETE_CFA_ERASE: ada_cfaerase(softc, bp, ataio); break; default: biofinish(bp, NULL, EOPNOTSUPP); xpt_release_ccb(start_ccb); adaschedule(periph); return; } start_ccb->ccb_h.ccb_state = ADA_CCB_TRIM; start_ccb->ccb_h.flags |= CAM_UNLOCKED; cam_iosched_submit_trim(softc->cam_iosched); goto out; case BIO_FLUSH: cam_fill_ataio(ataio, 1, adadone, CAM_DIR_NONE, 0, NULL, 0, ada_default_timeout*1000); if (softc->flags & ADA_FLAG_CAN_48BIT) ata_48bit_cmd(ataio, ATA_FLUSHCACHE48, 0, 0, 0); else ata_28bit_cmd(ataio, ATA_FLUSHCACHE, 0, 0, 0); break; case BIO_ZONE: { int error, queue_ccb; queue_ccb = 0; error = ada_zone_cmd(periph, start_ccb, bp, &queue_ccb); if ((error != 0) || (queue_ccb == 0)) { biofinish(bp, NULL, error); xpt_release_ccb(start_ccb); return; } break; } } start_ccb->ccb_h.ccb_state = ADA_CCB_BUFFER_IO; start_ccb->ccb_h.flags |= CAM_UNLOCKED; out: start_ccb->ccb_h.ccb_bp = bp; softc->outstanding_cmds++; softc->refcount++; cam_periph_unlock(periph); xpt_action(start_ccb); cam_periph_lock(periph); softc->refcount--; /* May have more work to do, so ensure we stay scheduled */ adaschedule(periph); break; } case ADA_STATE_RAHEAD: case ADA_STATE_WCACHE: { cam_fill_ataio(ataio, 1, adadone, CAM_DIR_NONE, 0, NULL, 0, ada_default_timeout*1000); if (softc->state == ADA_STATE_RAHEAD) { ata_28bit_cmd(ataio, ATA_SETFEATURES, ADA_RA ? ATA_SF_ENAB_RCACHE : ATA_SF_DIS_RCACHE, 0, 0); start_ccb->ccb_h.ccb_state = ADA_CCB_RAHEAD; } else { ata_28bit_cmd(ataio, ATA_SETFEATURES, ADA_WC ? ATA_SF_ENAB_WCACHE : ATA_SF_DIS_WCACHE, 0, 0); start_ccb->ccb_h.ccb_state = ADA_CCB_WCACHE; } start_ccb->ccb_h.flags |= CAM_DEV_QFREEZE; xpt_action(start_ccb); break; } case ADA_STATE_LOGDIR: { struct ata_gp_log_dir *log_dir; if ((softc->flags & ADA_FLAG_CAN_LOG) == 0) { adaprobedone(periph, start_ccb); break; } log_dir = malloc(sizeof(*log_dir), M_ATADA, M_NOWAIT|M_ZERO); if (log_dir == NULL) { xpt_print(periph->path, "Couldn't malloc log_dir " "data\n"); softc->state = ADA_STATE_NORMAL; xpt_release_ccb(start_ccb); break; } ata_read_log(ataio, /*retries*/1, /*cbfcnp*/adadone, /*log_address*/ ATA_LOG_DIRECTORY, /*page_number*/ 0, /*block_count*/ 1, /*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ? CAM_ATAIO_DMA : 0, /*data_ptr*/ (uint8_t *)log_dir, /*dxfer_len*/sizeof(*log_dir), /*timeout*/ada_default_timeout*1000); start_ccb->ccb_h.ccb_state = ADA_CCB_LOGDIR; xpt_action(start_ccb); break; } case ADA_STATE_IDDIR: { struct ata_identify_log_pages *id_dir; id_dir = malloc(sizeof(*id_dir), M_ATADA, M_NOWAIT | M_ZERO); if (id_dir == NULL) { xpt_print(periph->path, "Couldn't malloc id_dir " "data\n"); adaprobedone(periph, start_ccb); break; } ata_read_log(ataio, /*retries*/1, /*cbfcnp*/adadone, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_PAGE_LIST, /*block_count*/ 1, /*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ? CAM_ATAIO_DMA : 0, /*data_ptr*/ (uint8_t *)id_dir, /*dxfer_len*/ sizeof(*id_dir), /*timeout*/ada_default_timeout*1000); start_ccb->ccb_h.ccb_state = ADA_CCB_IDDIR; xpt_action(start_ccb); break; } case ADA_STATE_SUP_CAP: { struct ata_identify_log_sup_cap *sup_cap; sup_cap = malloc(sizeof(*sup_cap), M_ATADA, M_NOWAIT|M_ZERO); if (sup_cap == NULL) { xpt_print(periph->path, "Couldn't malloc sup_cap " "data\n"); adaprobedone(periph, start_ccb); break; } ata_read_log(ataio, /*retries*/1, /*cbfcnp*/adadone, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_SUP_CAP, /*block_count*/ 1, /*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ? CAM_ATAIO_DMA : 0, /*data_ptr*/ (uint8_t *)sup_cap, /*dxfer_len*/ sizeof(*sup_cap), /*timeout*/ada_default_timeout*1000); start_ccb->ccb_h.ccb_state = ADA_CCB_SUP_CAP; xpt_action(start_ccb); break; } case ADA_STATE_ZONE: { struct ata_zoned_info_log *ata_zone; ata_zone = malloc(sizeof(*ata_zone), M_ATADA, M_NOWAIT|M_ZERO); if (ata_zone == NULL) { xpt_print(periph->path, "Couldn't malloc ata_zone " "data\n"); adaprobedone(periph, start_ccb); break; } ata_read_log(ataio, /*retries*/1, /*cbfcnp*/adadone, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_ZDI, /*block_count*/ 1, /*protocol*/ softc->flags & ADA_FLAG_CAN_DMA ? CAM_ATAIO_DMA : 0, /*data_ptr*/ (uint8_t *)ata_zone, /*dxfer_len*/ sizeof(*ata_zone), /*timeout*/ada_default_timeout*1000); start_ccb->ccb_h.ccb_state = ADA_CCB_ZONE; xpt_action(start_ccb); break; } } } static void adaprobedone(struct cam_periph *periph, union ccb *ccb) { struct ada_softc *softc; softc = (struct ada_softc *)periph->softc; if (ccb != NULL) xpt_release_ccb(ccb); softc->state = ADA_STATE_NORMAL; softc->flags |= ADA_FLAG_PROBED; adaschedule(periph); if ((softc->flags & ADA_FLAG_ANNOUNCED) == 0) { softc->flags |= ADA_FLAG_ANNOUNCED; cam_periph_unhold(periph); } else { cam_periph_release_locked(periph); } } static void adazonedone(struct cam_periph *periph, union ccb *ccb) { struct bio *bp; bp = (struct bio *)ccb->ccb_h.ccb_bp; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: case DISK_ZONE_CLOSE: case DISK_ZONE_FINISH: case DISK_ZONE_RWP: break; case DISK_ZONE_REPORT_ZONES: { uint32_t avail_len; struct disk_zone_report *rep; struct scsi_report_zones_hdr *hdr; struct scsi_report_zones_desc *desc; struct disk_zone_rep_entry *entry; uint32_t hdr_len, num_avail; uint32_t num_to_fill, i; rep = &bp->bio_zone.zone_params.report; avail_len = ccb->ataio.dxfer_len - ccb->ataio.resid; /* * Note that bio_resid isn't normally used for zone * commands, but it is used by devstat_end_transaction_bio() * to determine how much data was transferred. Because * the size of the SCSI/ATA data structures is different * than the size of the BIO interface structures, the * amount of data actually transferred from the drive will * be different than the amount of data transferred to * the user. */ hdr = (struct scsi_report_zones_hdr *)ccb->ataio.data_ptr; if (avail_len < sizeof(*hdr)) { /* * Is there a better error than EIO here? We asked * for at least the header, and we got less than * that. */ bp->bio_error = EIO; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; break; } hdr_len = le32dec(hdr->length); if (hdr_len > 0) rep->entries_available = hdr_len / sizeof(*desc); else rep->entries_available = 0; /* * NOTE: using the same values for the BIO version of the * same field as the SCSI/ATA values. This means we could * get some additional values that aren't defined in bio.h * if more values of the same field are defined later. */ rep->header.same = hdr->byte4 & SRZ_SAME_MASK; rep->header.maximum_lba = le64dec(hdr->maximum_lba); /* * If the drive reports no entries that match the query, * we're done. */ if (hdr_len == 0) { rep->entries_filled = 0; bp->bio_resid = bp->bio_bcount; break; } num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc), hdr_len / sizeof(*desc)); /* * If the drive didn't return any data, then we're done. */ if (num_avail == 0) { rep->entries_filled = 0; bp->bio_resid = bp->bio_bcount; break; } num_to_fill = min(num_avail, rep->entries_allocated); /* * If the user didn't allocate any entries for us to fill, * we're done. */ if (num_to_fill == 0) { rep->entries_filled = 0; bp->bio_resid = bp->bio_bcount; break; } for (i = 0, desc = &hdr->desc_list[0], entry=&rep->entries[0]; i < num_to_fill; i++, desc++, entry++) { /* * NOTE: we're mapping the values here directly * from the SCSI/ATA bit definitions to the bio.h * definitions. There is also a warning in * disk_zone.h, but the impact is that if * additional values are added in the SCSI/ATA * specs these will be visible to consumers of * this interface. */ entry->zone_type = desc->zone_type & SRZ_TYPE_MASK; entry->zone_condition = (desc->zone_flags & SRZ_ZONE_COND_MASK) >> SRZ_ZONE_COND_SHIFT; entry->zone_flags |= desc->zone_flags & (SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET); entry->zone_length = le64dec(desc->zone_length); entry->zone_start_lba = le64dec(desc->zone_start_lba); entry->write_pointer_lba = le64dec(desc->write_pointer_lba); } rep->entries_filled = num_to_fill; /* * Note that this residual is accurate from the user's * standpoint, but the amount transferred isn't accurate * from the standpoint of what actually came back from the * drive. */ bp->bio_resid = bp->bio_bcount - (num_to_fill * sizeof(*entry)); break; } case DISK_ZONE_GET_PARAMS: default: /* * In theory we should not get a GET_PARAMS bio, since it * should be handled without queueing the command to the * drive. */ panic("%s: Invalid zone command %d", __func__, bp->bio_zone.zone_cmd); break; } if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) free(ccb->ataio.data_ptr, M_ATADA); } static void adadone(struct cam_periph *periph, union ccb *done_ccb) { struct ada_softc *softc; struct ccb_ataio *ataio; struct cam_path *path; uint32_t priority; int state; softc = (struct ada_softc *)periph->softc; ataio = &done_ccb->ataio; path = done_ccb->ccb_h.path; priority = done_ccb->ccb_h.pinfo.priority; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("adadone\n")); state = ataio->ccb_h.ccb_state & ADA_CCB_TYPE_MASK; switch (state) { case ADA_CCB_BUFFER_IO: case ADA_CCB_TRIM: { struct bio *bp; int error; cam_periph_lock(periph); bp = (struct bio *)done_ccb->ccb_h.ccb_bp; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = adaerror(done_ccb, 0, 0); if (error == ERESTART) { /* A retry was scheduled, so just return. */ cam_periph_unlock(periph); return; } if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); /* * If we get an error on an NCQ DSM TRIM, fall back * to a non-NCQ DSM TRIM forever. Please note that if * CAN_NCQ_TRIM is set, CAN_TRIM is necessarily set too. * However, for this one trim, we treat it as advisory * and return success up the stack. */ if (state == ADA_CCB_TRIM && error != 0 && (softc->flags & ADA_FLAG_CAN_NCQ_TRIM) != 0) { softc->flags &= ~ADA_FLAG_CAN_NCQ_TRIM; error = 0; adasetdeletemethod(softc); } } else { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) panic("REQ_CMP with QFRZN"); error = 0; } bp->bio_error = error; if (error != 0) { bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; } else { if (bp->bio_cmd == BIO_ZONE) adazonedone(periph, done_ccb); else if (state == ADA_CCB_TRIM) bp->bio_resid = 0; else bp->bio_resid = ataio->resid; if ((bp->bio_resid > 0) && (bp->bio_cmd != BIO_ZONE)) bp->bio_flags |= BIO_ERROR; } softc->outstanding_cmds--; if (softc->outstanding_cmds == 0) softc->flags |= ADA_FLAG_WAS_OTAG; /* * We need to call cam_iosched before we call biodone so that we * don't measure any activity that happens in the completion * routine, which in the case of sendfile can be quite * extensive. */ cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb); xpt_release_ccb(done_ccb); if (state == ADA_CCB_TRIM) { TAILQ_HEAD(, bio) queue; struct bio *bp1; TAILQ_INIT(&queue); TAILQ_CONCAT(&queue, &softc->trim_req.bps, bio_queue); /* * Normally, the xpt_release_ccb() above would make sure * that when we have more work to do, that work would * get kicked off. However, we specifically keep * trim_running set to 0 before the call above to allow * other I/O to progress when many BIO_DELETE requests * are pushed down. We set trim_running to 0 and call * daschedule again so that we don't stall if there are * no other I/Os pending apart from BIO_DELETEs. */ cam_iosched_trim_done(softc->cam_iosched); adaschedule(periph); cam_periph_unlock(periph); while ((bp1 = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, bp1, bio_queue); bp1->bio_error = error; if (error != 0) { bp1->bio_flags |= BIO_ERROR; bp1->bio_resid = bp1->bio_bcount; } else bp1->bio_resid = 0; biodone(bp1); } } else { adaschedule(periph); cam_periph_unlock(periph); biodone(bp); } return; } case ADA_CCB_RAHEAD: { if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (adaerror(done_ccb, 0, 0) == ERESTART) { /* Drop freeze taken due to CAM_DEV_QFREEZE */ cam_release_devq(path, 0, 0, 0, FALSE); return; } else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { cam_release_devq(path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } /* * Since our peripheral may be invalidated by an error * above or an external event, we must release our CCB * before releasing the reference on the peripheral. * The peripheral will only go away once the last reference * is removed, and we need it around for the CCB release * operation. */ xpt_release_ccb(done_ccb); softc->state = ADA_STATE_WCACHE; xpt_schedule(periph, priority); /* Drop freeze taken due to CAM_DEV_QFREEZE */ cam_release_devq(path, 0, 0, 0, FALSE); return; } case ADA_CCB_WCACHE: { if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (adaerror(done_ccb, 0, 0) == ERESTART) { /* Drop freeze taken due to CAM_DEV_QFREEZE */ cam_release_devq(path, 0, 0, 0, FALSE); return; } else if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { cam_release_devq(path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } /* Drop freeze taken due to CAM_DEV_QFREEZE */ cam_release_devq(path, 0, 0, 0, FALSE); if ((softc->flags & ADA_FLAG_CAN_LOG) && (softc->zone_mode != ADA_ZONE_NONE)) { xpt_release_ccb(done_ccb); softc->state = ADA_STATE_LOGDIR; xpt_schedule(periph, priority); } else { adaprobedone(periph, done_ccb); } return; } case ADA_CCB_LOGDIR: { int error; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { error = 0; softc->valid_logdir_len = 0; bzero(&softc->ata_logdir, sizeof(softc->ata_logdir)); softc->valid_logdir_len = ataio->dxfer_len - ataio->resid; if (softc->valid_logdir_len > 0) bcopy(ataio->data_ptr, &softc->ata_logdir, min(softc->valid_logdir_len, sizeof(softc->ata_logdir))); /* * Figure out whether the Identify Device log is * supported. The General Purpose log directory * has a header, and lists the number of pages * available for each GP log identified by the * offset into the list. */ if ((softc->valid_logdir_len >= ((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t))) && (le16dec(softc->ata_logdir.header) == ATA_GP_LOG_DIR_VERSION) && (le16dec(&softc->ata_logdir.num_pages[ (ATA_IDENTIFY_DATA_LOG * sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){ softc->flags |= ADA_FLAG_CAN_IDLOG; } else { softc->flags &= ~ADA_FLAG_CAN_IDLOG; } } else { error = adaerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA log directory, * then ATA logs are effectively not * supported even if the bit is set in the * identify data. */ softc->flags &= ~(ADA_FLAG_CAN_LOG | ADA_FLAG_CAN_IDLOG); if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(ataio->data_ptr, M_ATADA); if ((error == 0) && (softc->flags & ADA_FLAG_CAN_IDLOG)) { softc->state = ADA_STATE_IDDIR; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); } else adaprobedone(periph, done_ccb); return; } case ADA_CCB_IDDIR: { int error; if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { off_t entries_offset, max_entries; error = 0; softc->valid_iddir_len = 0; bzero(&softc->ata_iddir, sizeof(softc->ata_iddir)); softc->flags &= ~(ADA_FLAG_CAN_SUPCAP | ADA_FLAG_CAN_ZONE); softc->valid_iddir_len = ataio->dxfer_len - ataio->resid; if (softc->valid_iddir_len > 0) bcopy(ataio->data_ptr, &softc->ata_iddir, min(softc->valid_iddir_len, sizeof(softc->ata_iddir))); entries_offset = __offsetof(struct ata_identify_log_pages,entries); max_entries = softc->valid_iddir_len - entries_offset; if ((softc->valid_iddir_len > (entries_offset + 1)) && (le64dec(softc->ata_iddir.header) == ATA_IDLOG_REVISION) && (softc->ata_iddir.entry_count > 0)) { int num_entries, i; num_entries = softc->ata_iddir.entry_count; num_entries = min(num_entries, softc->valid_iddir_len - entries_offset); for (i = 0; i < num_entries && i < max_entries; i++) { if (softc->ata_iddir.entries[i] == ATA_IDL_SUP_CAP) softc->flags |= ADA_FLAG_CAN_SUPCAP; else if (softc->ata_iddir.entries[i]== ATA_IDL_ZDI) softc->flags |= ADA_FLAG_CAN_ZONE; if ((softc->flags & ADA_FLAG_CAN_SUPCAP) && (softc->flags & ADA_FLAG_CAN_ZONE)) break; } } } else { error = adaerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA Identify Data log * directory, then it effectively isn't * supported even if the ATA Log directory * a non-zero number of pages present for * this log. */ softc->flags &= ~ADA_FLAG_CAN_IDLOG; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(ataio->data_ptr, M_ATADA); if ((error == 0) && (softc->flags & ADA_FLAG_CAN_SUPCAP)) { softc->state = ADA_STATE_SUP_CAP; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); } else adaprobedone(periph, done_ccb); return; } case ADA_CCB_SUP_CAP: { int error; if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t valid_len; size_t needed_size; struct ata_identify_log_sup_cap *sup_cap; error = 0; sup_cap = (struct ata_identify_log_sup_cap *) ataio->data_ptr; valid_len = ataio->dxfer_len - ataio->resid; needed_size = __offsetof(struct ata_identify_log_sup_cap, sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap); if (valid_len >= needed_size) { uint64_t zoned, zac_cap; zoned = le64dec(sup_cap->zoned_cap); if (zoned & ATA_ZONED_VALID) { /* * This should have already been * set, because this is also in the * ATA identify data. */ if ((zoned & ATA_ZONED_MASK) == ATA_SUPPORT_ZONE_HOST_AWARE) softc->zone_mode = ADA_ZONE_HOST_AWARE; else if ((zoned & ATA_ZONED_MASK) == ATA_SUPPORT_ZONE_DEV_MANAGED) softc->zone_mode = ADA_ZONE_DRIVE_MANAGED; } zac_cap = le64dec(sup_cap->sup_zac_cap); if (zac_cap & ATA_SUP_ZAC_CAP_VALID) { if (zac_cap & ATA_REPORT_ZONES_SUP) softc->zone_flags |= ADA_ZONE_FLAG_RZ_SUP; if (zac_cap & ATA_ND_OPEN_ZONE_SUP) softc->zone_flags |= ADA_ZONE_FLAG_OPEN_SUP; if (zac_cap & ATA_ND_CLOSE_ZONE_SUP) softc->zone_flags |= ADA_ZONE_FLAG_CLOSE_SUP; if (zac_cap & ATA_ND_FINISH_ZONE_SUP) softc->zone_flags |= ADA_ZONE_FLAG_FINISH_SUP; if (zac_cap & ATA_ND_RWP_SUP) softc->zone_flags |= ADA_ZONE_FLAG_RWP_SUP; } else { /* * This field was introduced in * ACS-4, r08 on April 28th, 2015. * If the drive firmware was written * to an earlier spec, it won't have * the field. So, assume all * commands are supported. */ softc->zone_flags |= ADA_ZONE_FLAG_SUP_MASK; } } } else { error = adaerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA Identify Data * Supported Capabilities page, clear the * flag... */ softc->flags &= ~ADA_FLAG_CAN_SUPCAP; /* * And clear zone capabilities. */ softc->zone_flags &= ~ADA_ZONE_FLAG_SUP_MASK; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(ataio->data_ptr, M_ATADA); if ((error == 0) && (softc->flags & ADA_FLAG_CAN_ZONE)) { softc->state = ADA_STATE_ZONE; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); } else adaprobedone(periph, done_ccb); return; } case ADA_CCB_ZONE: { int error; if ((ataio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { struct ata_zoned_info_log *zi_log; uint32_t valid_len; size_t needed_size; zi_log = (struct ata_zoned_info_log *)ataio->data_ptr; valid_len = ataio->dxfer_len - ataio->resid; needed_size = __offsetof(struct ata_zoned_info_log, version_info) + 1 + sizeof(zi_log->version_info); if (valid_len >= needed_size) { uint64_t tmpvar; tmpvar = le64dec(zi_log->zoned_cap); if (tmpvar & ATA_ZDI_CAP_VALID) { if (tmpvar & ATA_ZDI_CAP_URSWRZ) softc->zone_flags |= ADA_ZONE_FLAG_URSWRZ; else softc->zone_flags &= ~ADA_ZONE_FLAG_URSWRZ; } tmpvar = le64dec(zi_log->optimal_seq_zones); if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) { softc->zone_flags |= ADA_ZONE_FLAG_OPT_SEQ_SET; softc->optimal_seq_zones = (tmpvar & ATA_ZDI_OPT_SEQ_MASK); } else { softc->zone_flags &= ~ADA_ZONE_FLAG_OPT_SEQ_SET; softc->optimal_seq_zones = 0; } tmpvar =le64dec(zi_log->optimal_nonseq_zones); if (tmpvar & ATA_ZDI_OPT_NS_VALID) { softc->zone_flags |= ADA_ZONE_FLAG_OPT_NONSEQ_SET; softc->optimal_nonseq_zones = (tmpvar & ATA_ZDI_OPT_NS_MASK); } else { softc->zone_flags &= ~ADA_ZONE_FLAG_OPT_NONSEQ_SET; softc->optimal_nonseq_zones = 0; } tmpvar = le64dec(zi_log->max_seq_req_zones); if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) { softc->zone_flags |= ADA_ZONE_FLAG_MAX_SEQ_SET; softc->max_seq_zones = (tmpvar & ATA_ZDI_MAX_SEQ_MASK); } else { softc->zone_flags &= ~ADA_ZONE_FLAG_MAX_SEQ_SET; softc->max_seq_zones = 0; } } } else { error = adaerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { softc->flags &= ~ADA_FLAG_CAN_ZONE; softc->flags &= ~ADA_ZONE_FLAG_SET_MASK; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(ataio->data_ptr, M_ATADA); adaprobedone(periph, done_ccb); return; } case ADA_CCB_DUMP: /* No-op. We're polling */ return; default: break; } xpt_release_ccb(done_ccb); } static int adaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { #ifdef CAM_IO_STATS struct ada_softc *softc; struct cam_periph *periph; periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct ada_softc *)periph->softc; switch (ccb->ccb_h.status & CAM_STATUS_MASK) { case CAM_CMD_TIMEOUT: softc->timeouts++; break; case CAM_REQ_ABORTED: case CAM_REQ_CMP_ERR: case CAM_REQ_TERMIO: case CAM_UNREC_HBA_ERROR: case CAM_DATA_RUN_ERR: case CAM_ATA_STATUS_ERROR: softc->errors++; break; default: break; } #endif return(cam_periph_error(ccb, cam_flags, sense_flags)); } static void adagetparams(struct cam_periph *periph, struct ccb_getdev *cgd) { struct ada_softc *softc = (struct ada_softc *)periph->softc; struct disk_params *dp = &softc->params; u_int64_t lbasize48; u_int32_t lbasize; dp->secsize = ata_logical_sector_size(&cgd->ident_data); if ((cgd->ident_data.atavalid & ATA_FLAG_54_58) && cgd->ident_data.current_heads && cgd->ident_data.current_sectors) { dp->heads = cgd->ident_data.current_heads; dp->secs_per_track = cgd->ident_data.current_sectors; dp->cylinders = cgd->ident_data.cylinders; dp->sectors = (u_int32_t)cgd->ident_data.current_size_1 | ((u_int32_t)cgd->ident_data.current_size_2 << 16); } else { dp->heads = cgd->ident_data.heads; dp->secs_per_track = cgd->ident_data.sectors; dp->cylinders = cgd->ident_data.cylinders; dp->sectors = cgd->ident_data.cylinders * (u_int32_t)(dp->heads * dp->secs_per_track); } lbasize = (u_int32_t)cgd->ident_data.lba_size_1 | ((u_int32_t)cgd->ident_data.lba_size_2 << 16); /* use the 28bit LBA size if valid or bigger than the CHS mapping */ if (cgd->ident_data.cylinders == 16383 || dp->sectors < lbasize) dp->sectors = lbasize; /* use the 48bit LBA size if valid */ lbasize48 = ((u_int64_t)cgd->ident_data.lba_size48_1) | ((u_int64_t)cgd->ident_data.lba_size48_2 << 16) | ((u_int64_t)cgd->ident_data.lba_size48_3 << 32) | ((u_int64_t)cgd->ident_data.lba_size48_4 << 48); if ((cgd->ident_data.support.command2 & ATA_SUPPORT_ADDRESS48) && lbasize48 > ATA_MAX_28BIT_LBA) dp->sectors = lbasize48; } static void adasendorderedtag(void *arg) { struct ada_softc *softc = arg; if (ada_send_ordered) { if (softc->outstanding_cmds > 0) { if ((softc->flags & ADA_FLAG_WAS_OTAG) == 0) softc->flags |= ADA_FLAG_NEED_OTAG; softc->flags &= ~ADA_FLAG_WAS_OTAG; } } /* Queue us up again */ callout_reset(&softc->sendordered_c, (ada_default_timeout * hz) / ADA_ORDEREDTAG_INTERVAL, adasendorderedtag, softc); } /* * Step through all ADA peripheral drivers, and if the device is still open, * sync the disk cache to physical media. */ static void adaflush(void) { struct cam_periph *periph; struct ada_softc *softc; union ccb *ccb; int error; CAM_PERIPH_FOREACH(periph, &adadriver) { softc = (struct ada_softc *)periph->softc; if (SCHEDULER_STOPPED()) { /* If we paniced with the lock held, do not recurse. */ if (!cam_periph_owned(periph) && (softc->flags & ADA_FLAG_OPEN)) { adadump(softc->disk, NULL, 0, 0, 0); } continue; } cam_periph_lock(periph); /* * We only sync the cache if the drive is still open, and * if the drive is capable of it.. */ if (((softc->flags & ADA_FLAG_OPEN) == 0) || (softc->flags & ADA_FLAG_CAN_FLUSHCACHE) == 0) { cam_periph_unlock(periph); continue; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); cam_fill_ataio(&ccb->ataio, 0, adadone, CAM_DIR_NONE, 0, NULL, 0, ada_default_timeout*1000); if (softc->flags & ADA_FLAG_CAN_48BIT) ata_48bit_cmd(&ccb->ataio, ATA_FLUSHCACHE48, 0, 0, 0); else ata_28bit_cmd(&ccb->ataio, ATA_FLUSHCACHE, 0, 0, 0); error = cam_periph_runccb(ccb, adaerror, /*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); xpt_release_ccb(ccb); cam_periph_unlock(periph); } } static void adaspindown(uint8_t cmd, int flags) { struct cam_periph *periph; struct ada_softc *softc; struct ccb_ataio local_ccb; int error; CAM_PERIPH_FOREACH(periph, &adadriver) { /* If we paniced with lock held - not recurse here. */ if (cam_periph_owned(periph)) continue; cam_periph_lock(periph); softc = (struct ada_softc *)periph->softc; /* * We only spin-down the drive if it is capable of it.. */ if ((softc->flags & ADA_FLAG_CAN_POWERMGT) == 0) { cam_periph_unlock(periph); continue; } if (bootverbose) xpt_print(periph->path, "spin-down\n"); memset(&local_ccb, 0, sizeof(local_ccb)); xpt_setup_ccb(&local_ccb.ccb_h, periph->path, CAM_PRIORITY_NORMAL); local_ccb.ccb_h.ccb_state = ADA_CCB_DUMP; cam_fill_ataio(&local_ccb, 0, adadone, CAM_DIR_NONE | flags, 0, NULL, 0, ada_default_timeout*1000); ata_28bit_cmd(&local_ccb, cmd, 0, 0, 0); error = cam_periph_runccb((union ccb *)&local_ccb, adaerror, /*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Spin-down disk failed\n"); cam_periph_unlock(periph); } } static void adashutdown(void *arg, int howto) { int how; adaflush(); /* * STANDBY IMMEDIATE saves any volatile data to the drive. It also spins * down hard drives. IDLE IMMEDIATE also saves the volatile data without * a spindown. We send the former when we expect to lose power soon. For * a warm boot, we send the latter to avoid a thundering herd of spinups * just after the kernel loads while probing. We have to do something to * flush the data because the BIOS in many systems resets the HBA * causing a COMINIT/COMRESET negotiation, which some drives interpret * as license to toss the volatile data, and others count as unclean * shutdown when in the Active PM state in SMART attributes. * * adaspindown will ensure that we don't send this to a drive that * doesn't support it. */ if (ada_spindown_shutdown != 0) { how = (howto & (RB_HALT | RB_POWEROFF | RB_POWERCYCLE)) ? ATA_STANDBY_IMMEDIATE : ATA_IDLE_IMMEDIATE; adaspindown(how, 0); } } static void adasuspend(void *arg) { adaflush(); /* * SLEEP also fushes any volatile data, like STANDBY IMEDIATE, * so we don't need to send it as well. */ if (ada_spindown_suspend != 0) adaspindown(ATA_SLEEP, CAM_DEV_QFREEZE); } static void adaresume(void *arg) { struct cam_periph *periph; struct ada_softc *softc; if (ada_spindown_suspend == 0) return; CAM_PERIPH_FOREACH(periph, &adadriver) { cam_periph_lock(periph); softc = (struct ada_softc *)periph->softc; /* * We only spin-down the drive if it is capable of it.. */ if ((softc->flags & ADA_FLAG_CAN_POWERMGT) == 0) { cam_periph_unlock(periph); continue; } if (bootverbose) xpt_print(periph->path, "resume\n"); /* * Drop freeze taken due to CAM_DEV_QFREEZE flag set on * sleep request. */ cam_release_devq(periph->path, /*relsim_flags*/0, /*openings*/0, /*timeout*/0, /*getcount_only*/0); cam_periph_unlock(periph); } } #endif /* _KERNEL */ Index: head/sys/cam/cam_iosched.c =================================================================== --- head/sys/cam/cam_iosched.c (revision 329818) +++ head/sys/cam/cam_iosched.c (revision 329819) @@ -1,1808 +1,1769 @@ /*- * CAM IO Scheduler Interface * * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Netflix, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_cam.h" #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_CAMSCHED, "CAM I/O Scheduler", "CAM I/O Scheduler buffers"); /* * Trim or similar currently pending completion. Should only be set for * those drivers wishing only one Trim active at a time. */ #define CAM_IOSCHED_FLAG_TRIM_ACTIVE (1ul << 0) /* Callout active, and needs to be torn down */ #define CAM_IOSCHED_FLAG_CALLOUT_ACTIVE (1ul << 1) /* Timer has just ticked */ #define CAM_IOSCHED_FLAG_TICK (1ul << 2) - /* When set, defer trims until after next tick */ -#define CAM_IOSCHED_FLAG_TRIM_QONLY (1ul << 4) /* Periph drivers set these flags to indicate work */ #define CAM_IOSCHED_FLAG_WORK_FLAGS ((0xffffu) << 16) /* * Default I/O scheduler for FreeBSD. This implementation is just a thin-vineer * over the bioq_* interface, with notions of separate calls for normal I/O and * for trims. * * When CAM_IOSCHED_DYNAMIC is defined, the scheduler is enhanced to dynamically * steer the rate of one type of traffic to help other types of traffic (eg * limit writes when read latency deteriorates on SSDs). */ #ifdef CAM_IOSCHED_DYNAMIC static int do_dynamic_iosched = 1; TUNABLE_INT("kern.cam.do_dynamic_iosched", &do_dynamic_iosched); SYSCTL_INT(_kern_cam, OID_AUTO, do_dynamic_iosched, CTLFLAG_RD, &do_dynamic_iosched, 1, "Enable Dynamic I/O scheduler optimizations."); /* * For an EMA, with an alpha of alpha, we know * alpha = 2 / (N + 1) * or * N = 1 + (2 / alpha) * where N is the number of samples that 86% of the current * EMA is derived from. * * So we invent[*] alpha_bits: * alpha_bits = -log_2(alpha) * alpha = 2^-alpha_bits * So * N = 1 + 2^(alpha_bits + 1) * * The default 9 gives a 1025 lookback for 86% of the data. * For a brief intro: https://en.wikipedia.org/wiki/Moving_average * * [*] Steal from the load average code and many other places. * Note: See computation of EMA and EMVAR for acceptable ranges of alpha. */ static int alpha_bits = 9; TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits); SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW, &alpha_bits, 1, "Bits in EMA's alpha."); struct iop_stats; struct cam_iosched_softc; int iosched_debug = 0; typedef enum { none = 0, /* No limits */ queue_depth, /* Limit how many ops we queue to SIM */ iops, /* Limit # of IOPS to the drive */ bandwidth, /* Limit bandwidth to the drive */ limiter_max } io_limiter; static const char *cam_iosched_limiter_names[] = { "none", "queue_depth", "iops", "bandwidth" }; /* * Called to initialize the bits of the iop_stats structure relevant to the * limiter. Called just after the limiter is set. */ typedef int l_init_t(struct iop_stats *); /* * Called every tick. */ typedef int l_tick_t(struct iop_stats *); /* * Called to see if the limiter thinks this IOP can be allowed to * proceed. If so, the limiter assumes that the IOP proceeded * and makes any accounting of it that's needed. */ typedef int l_iop_t(struct iop_stats *, struct bio *); /* * Called when an I/O completes so the limiter can update its * accounting. Pending I/Os may complete in any order (even when * sent to the hardware at the same time), so the limiter may not * make any assumptions other than this I/O has completed. If it * returns 1, then xpt_schedule() needs to be called again. */ typedef int l_iodone_t(struct iop_stats *, struct bio *); static l_iop_t cam_iosched_qd_iop; static l_iop_t cam_iosched_qd_caniop; static l_iodone_t cam_iosched_qd_iodone; static l_init_t cam_iosched_iops_init; static l_tick_t cam_iosched_iops_tick; static l_iop_t cam_iosched_iops_caniop; static l_iop_t cam_iosched_iops_iop; static l_init_t cam_iosched_bw_init; static l_tick_t cam_iosched_bw_tick; static l_iop_t cam_iosched_bw_caniop; static l_iop_t cam_iosched_bw_iop; struct limswitch { l_init_t *l_init; l_tick_t *l_tick; l_iop_t *l_iop; l_iop_t *l_caniop; l_iodone_t *l_iodone; } limsw[] = { { /* none */ .l_init = NULL, .l_tick = NULL, .l_iop = NULL, .l_iodone= NULL, }, { /* queue_depth */ .l_init = NULL, .l_tick = NULL, .l_caniop = cam_iosched_qd_caniop, .l_iop = cam_iosched_qd_iop, .l_iodone= cam_iosched_qd_iodone, }, { /* iops */ .l_init = cam_iosched_iops_init, .l_tick = cam_iosched_iops_tick, .l_caniop = cam_iosched_iops_caniop, .l_iop = cam_iosched_iops_iop, .l_iodone= NULL, }, { /* bandwidth */ .l_init = cam_iosched_bw_init, .l_tick = cam_iosched_bw_tick, .l_caniop = cam_iosched_bw_caniop, .l_iop = cam_iosched_bw_iop, .l_iodone= NULL, }, }; struct iop_stats { /* * sysctl state for this subnode. */ struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; /* * Information about the current rate limiters, if any */ io_limiter limiter; /* How are I/Os being limited */ int min; /* Low range of limit */ int max; /* High range of limit */ int current; /* Current rate limiter */ int l_value1; /* per-limiter scratch value 1. */ int l_value2; /* per-limiter scratch value 2. */ /* * Debug information about counts of I/Os that have gone through the * scheduler. */ int pending; /* I/Os pending in the hardware */ int queued; /* number currently in the queue */ int total; /* Total for all time -- wraps */ int in; /* number queued all time -- wraps */ int out; /* number completed all time -- wraps */ int errs; /* Number of I/Os completed with error -- wraps */ /* * Statistics on different bits of the process. */ /* Exp Moving Average, see alpha_bits for more details */ sbintime_t ema; sbintime_t emvar; sbintime_t sd; /* Last computed sd */ uint32_t state_flags; #define IOP_RATE_LIMITED 1u #define LAT_BUCKETS 15 /* < 1ms < 2ms ... < 2^(n-1)ms >= 2^(n-1)ms*/ uint64_t latencies[LAT_BUCKETS]; struct cam_iosched_softc *softc; }; typedef enum { set_max = 0, /* current = max */ read_latency, /* Steer read latency by throttling writes */ cl_max /* Keep last */ } control_type; static const char *cam_iosched_control_type_names[] = { "set_max", "read_latency" }; struct control_loop { /* * sysctl state for this subnode. */ struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; sbintime_t next_steer; /* Time of next steer */ sbintime_t steer_interval; /* How often do we steer? */ sbintime_t lolat; sbintime_t hilat; int alpha; control_type type; /* What type of control? */ int last_count; /* Last I/O count */ struct cam_iosched_softc *softc; }; #endif struct cam_iosched_softc { struct bio_queue_head bio_queue; struct bio_queue_head trim_queue; /* scheduler flags < 16, user flags >= 16 */ uint32_t flags; - u_int caps; int sort_io_queue; #ifdef CAM_IOSCHED_DYNAMIC int read_bias; /* Read bias setting */ int current_read_bias; /* Current read bias state */ int total_ticks; int load; /* EMA of 'load average' of disk / 2^16 */ struct bio_queue_head write_queue; struct iop_stats read_stats, write_stats, trim_stats; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; int quanta; /* Number of quanta per second */ struct callout ticker; /* Callout for our quota system */ struct cam_periph *periph; /* cam periph associated with this device */ uint32_t this_frac; /* Fraction of a second (1024ths) for this tick */ sbintime_t last_time; /* Last time we ticked */ struct control_loop cl; #endif }; #ifdef CAM_IOSCHED_DYNAMIC /* * helper functions to call the limsw functions. */ static int cam_iosched_limiter_init(struct iop_stats *ios) { int lim = ios->limiter; /* maybe this should be a kassert */ if (lim < none || lim >= limiter_max) return EINVAL; if (limsw[lim].l_init) return limsw[lim].l_init(ios); return 0; } static int cam_iosched_limiter_tick(struct iop_stats *ios) { int lim = ios->limiter; /* maybe this should be a kassert */ if (lim < none || lim >= limiter_max) return EINVAL; if (limsw[lim].l_tick) return limsw[lim].l_tick(ios); return 0; } static int cam_iosched_limiter_iop(struct iop_stats *ios, struct bio *bp) { int lim = ios->limiter; /* maybe this should be a kassert */ if (lim < none || lim >= limiter_max) return EINVAL; if (limsw[lim].l_iop) return limsw[lim].l_iop(ios, bp); return 0; } static int cam_iosched_limiter_caniop(struct iop_stats *ios, struct bio *bp) { int lim = ios->limiter; /* maybe this should be a kassert */ if (lim < none || lim >= limiter_max) return EINVAL; if (limsw[lim].l_caniop) return limsw[lim].l_caniop(ios, bp); return 0; } static int cam_iosched_limiter_iodone(struct iop_stats *ios, struct bio *bp) { int lim = ios->limiter; /* maybe this should be a kassert */ if (lim < none || lim >= limiter_max) return 0; if (limsw[lim].l_iodone) return limsw[lim].l_iodone(ios, bp); return 0; } /* * Functions to implement the different kinds of limiters */ static int cam_iosched_qd_iop(struct iop_stats *ios, struct bio *bp) { if (ios->current <= 0 || ios->pending < ios->current) return 0; return EAGAIN; } static int cam_iosched_qd_caniop(struct iop_stats *ios, struct bio *bp) { if (ios->current <= 0 || ios->pending < ios->current) return 0; return EAGAIN; } static int cam_iosched_qd_iodone(struct iop_stats *ios, struct bio *bp) { if (ios->current <= 0 || ios->pending != ios->current) return 0; return 1; } static int cam_iosched_iops_init(struct iop_stats *ios) { ios->l_value1 = ios->current / ios->softc->quanta; if (ios->l_value1 <= 0) ios->l_value1 = 1; ios->l_value2 = 0; return 0; } static int cam_iosched_iops_tick(struct iop_stats *ios) { int new_ios; /* * Allow at least one IO per tick until all * the IOs for this interval have been spent. */ new_ios = (int)((ios->current * (uint64_t)ios->softc->this_frac) >> 16); if (new_ios < 1 && ios->l_value2 < ios->current) { new_ios = 1; ios->l_value2++; } /* * If this a new accounting interval, discard any "unspent" ios * granted in the previous interval. Otherwise add the new ios to * the previously granted ones that haven't been spent yet. */ if ((ios->softc->total_ticks % ios->softc->quanta) == 0) { ios->l_value1 = new_ios; ios->l_value2 = 1; } else { ios->l_value1 += new_ios; } return 0; } static int cam_iosched_iops_caniop(struct iop_stats *ios, struct bio *bp) { /* * So if we have any more IOPs left, allow it, * otherwise wait. If current iops is 0, treat that * as unlimited as a failsafe. */ if (ios->current > 0 && ios->l_value1 <= 0) return EAGAIN; return 0; } static int cam_iosched_iops_iop(struct iop_stats *ios, struct bio *bp) { int rv; rv = cam_iosched_limiter_caniop(ios, bp); if (rv == 0) ios->l_value1--; return rv; } static int cam_iosched_bw_init(struct iop_stats *ios) { /* ios->current is in kB/s, so scale to bytes */ ios->l_value1 = ios->current * 1000 / ios->softc->quanta; return 0; } static int cam_iosched_bw_tick(struct iop_stats *ios) { int bw; /* * If we're in the hole for available quota from * the last time, then add the quantum for this. * If we have any left over from last quantum, * then too bad, that's lost. Also, ios->current * is in kB/s, so scale. * * We also allow up to 4 quanta of credits to * accumulate to deal with burstiness. 4 is extremely * arbitrary. */ bw = (int)((ios->current * 1000ull * (uint64_t)ios->softc->this_frac) >> 16); if (ios->l_value1 < bw * 4) ios->l_value1 += bw; return 0; } static int cam_iosched_bw_caniop(struct iop_stats *ios, struct bio *bp) { /* * So if we have any more bw quota left, allow it, * otherwise wait. Note, we'll go negative and that's * OK. We'll just get a little less next quota. * * Note on going negative: that allows us to process * requests in order better, since we won't allow * shorter reads to get around the long one that we * don't have the quota to do just yet. It also prevents * starvation by being a little more permissive about * what we let through this quantum (to prevent the * starvation), at the cost of getting a little less * next quantum. * * Also note that if the current limit is <= 0, * we treat it as unlimited as a failsafe. */ if (ios->current > 0 && ios->l_value1 <= 0) return EAGAIN; return 0; } static int cam_iosched_bw_iop(struct iop_stats *ios, struct bio *bp) { int rv; rv = cam_iosched_limiter_caniop(ios, bp); if (rv == 0) ios->l_value1 -= bp->bio_length; return rv; } static void cam_iosched_cl_maybe_steer(struct control_loop *clp); static void cam_iosched_ticker(void *arg) { struct cam_iosched_softc *isc = arg; sbintime_t now, delta; int pending; callout_reset(&isc->ticker, hz / isc->quanta, cam_iosched_ticker, isc); now = sbinuptime(); delta = now - isc->last_time; isc->this_frac = (uint32_t)delta >> 16; /* Note: discards seconds -- should be 0 harmless if not */ isc->last_time = now; cam_iosched_cl_maybe_steer(&isc->cl); cam_iosched_limiter_tick(&isc->read_stats); cam_iosched_limiter_tick(&isc->write_stats); cam_iosched_limiter_tick(&isc->trim_stats); - isc->flags |= CAM_IOSCHED_FLAG_TICK; + isc->flags |= CAM_IOSCHED_FLAGS_TICK; cam_iosched_schedule(isc, isc->periph); /* * isc->load is an EMA of the pending I/Os at each tick. The number of * pending I/Os is the sum of the I/Os queued to the hardware, and those * in the software queue that could be queued to the hardware if there * were slots. * * ios_stats.pending is a count of requests in the SIM right now for * each of these types of I/O. So the total pending count is the sum of * these I/Os and the sum of the queued I/Os still in the software queue * for those operations that aren't being rate limited at the moment. * * The reason for the rate limiting bit is because those I/Os * aren't part of the software queued load (since we could * give them to hardware, but choose not to). * * Note: due to a bug in counting pending TRIM in the device, we * don't include them in this count. We count each BIO_DELETE in * the pending count, but the periph drivers collapse them down * into one TRIM command. That one trim command gets the completion * so the counts get off. */ pending = isc->read_stats.pending + isc->write_stats.pending /* + isc->trim_stats.pending */; pending += !!(isc->read_stats.state_flags & IOP_RATE_LIMITED) * isc->read_stats.queued + !!(isc->write_stats.state_flags & IOP_RATE_LIMITED) * isc->write_stats.queued /* + !!(isc->trim_stats.state_flags & IOP_RATE_LIMITED) * isc->trim_stats.queued */ ; pending <<= 16; pending /= isc->periph->path->device->ccbq.total_openings; isc->load = (pending + (isc->load << 13) - isc->load) >> 13; /* see above: 13 -> 16139 / 200/s = ~81s ~1 minute */ isc->total_ticks++; } static void cam_iosched_cl_init(struct control_loop *clp, struct cam_iosched_softc *isc) { clp->next_steer = sbinuptime(); clp->softc = isc; clp->steer_interval = SBT_1S * 5; /* Let's start out steering every 5s */ clp->lolat = 5 * SBT_1MS; clp->hilat = 15 * SBT_1MS; clp->alpha = 20; /* Alpha == gain. 20 = .2 */ clp->type = set_max; } static void cam_iosched_cl_maybe_steer(struct control_loop *clp) { struct cam_iosched_softc *isc; sbintime_t now, lat; int old; isc = clp->softc; now = isc->last_time; if (now < clp->next_steer) return; clp->next_steer = now + clp->steer_interval; switch (clp->type) { case set_max: if (isc->write_stats.current != isc->write_stats.max) printf("Steering write from %d kBps to %d kBps\n", isc->write_stats.current, isc->write_stats.max); isc->read_stats.current = isc->read_stats.max; isc->write_stats.current = isc->write_stats.max; isc->trim_stats.current = isc->trim_stats.max; break; case read_latency: old = isc->write_stats.current; lat = isc->read_stats.ema; /* * Simple PLL-like engine. Since we're steering to a range for * the SP (set point) that makes things a little more * complicated. In addition, we're not directly controlling our * PV (process variable), the read latency, but instead are * manipulating the write bandwidth limit for our MV * (manipulation variable), analysis of this code gets a bit * messy. Also, the MV is a very noisy control surface for read * latency since it is affected by many hidden processes inside * the device which change how responsive read latency will be * in reaction to changes in write bandwidth. Unlike the classic * boiler control PLL. this may result in over-steering while * the SSD takes its time to react to the new, lower load. This * is why we use a relatively low alpha of between .1 and .25 to * compensate for this effect. At .1, it takes ~22 steering * intervals to back off by a factor of 10. At .2 it only takes * ~10. At .25 it only takes ~8. However some preliminary data * from the SSD drives suggests a reasponse time in 10's of * seconds before latency drops regardless of the new write * rate. Careful observation will be required to tune this * effectively. * * Also, when there's no read traffic, we jack up the write * limit too regardless of the last read latency. 10 is * somewhat arbitrary. */ if (lat < clp->lolat || isc->read_stats.total - clp->last_count < 10) isc->write_stats.current = isc->write_stats.current * (100 + clp->alpha) / 100; /* Scale up */ else if (lat > clp->hilat) isc->write_stats.current = isc->write_stats.current * (100 - clp->alpha) / 100; /* Scale down */ clp->last_count = isc->read_stats.total; /* * Even if we don't steer, per se, enforce the min/max limits as * those may have changed. */ if (isc->write_stats.current < isc->write_stats.min) isc->write_stats.current = isc->write_stats.min; if (isc->write_stats.current > isc->write_stats.max) isc->write_stats.current = isc->write_stats.max; if (old != isc->write_stats.current && iosched_debug) printf("Steering write from %d kBps to %d kBps due to latency of %jdus\n", old, isc->write_stats.current, (uintmax_t)((uint64_t)1000000 * (uint32_t)lat) >> 32); break; case cl_max: break; } } #endif #ifdef CAM_IOSCHED_DYNAMIC static void cam_iosched_io_metric_update(struct cam_iosched_softc *isc, sbintime_t sim_latency, int cmd, size_t size); #endif static inline int cam_iosched_has_flagged_work(struct cam_iosched_softc *isc) { return !!(isc->flags & CAM_IOSCHED_FLAG_WORK_FLAGS); } static inline int cam_iosched_has_io(struct cam_iosched_softc *isc) { #ifdef CAM_IOSCHED_DYNAMIC if (do_dynamic_iosched) { struct bio *rbp = bioq_first(&isc->bio_queue); struct bio *wbp = bioq_first(&isc->write_queue); int can_write = wbp != NULL && cam_iosched_limiter_caniop(&isc->write_stats, wbp) == 0; int can_read = rbp != NULL && cam_iosched_limiter_caniop(&isc->read_stats, rbp) == 0; if (iosched_debug > 2) { printf("can write %d: pending_writes %d max_writes %d\n", can_write, isc->write_stats.pending, isc->write_stats.max); printf("can read %d: read_stats.pending %d max_reads %d\n", can_read, isc->read_stats.pending, isc->read_stats.max); printf("Queued reads %d writes %d\n", isc->read_stats.queued, isc->write_stats.queued); } return can_read || can_write; } #endif return bioq_first(&isc->bio_queue) != NULL; } static inline int cam_iosched_has_more_trim(struct cam_iosched_softc *isc) { return !(isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) && bioq_first(&isc->trim_queue); } #define cam_iosched_sort_queue(isc) ((isc)->sort_io_queue >= 0 ? \ (isc)->sort_io_queue : cam_sort_io_queues) static inline int cam_iosched_has_work(struct cam_iosched_softc *isc) { #ifdef CAM_IOSCHED_DYNAMIC if (iosched_debug > 2) printf("has work: %d %d %d\n", cam_iosched_has_io(isc), cam_iosched_has_more_trim(isc), cam_iosched_has_flagged_work(isc)); #endif return cam_iosched_has_io(isc) || cam_iosched_has_more_trim(isc) || cam_iosched_has_flagged_work(isc); } #ifdef CAM_IOSCHED_DYNAMIC static void cam_iosched_iop_stats_init(struct cam_iosched_softc *isc, struct iop_stats *ios) { ios->limiter = none; ios->in = 0; ios->max = ios->current = 300000; ios->min = 1; ios->out = 0; ios->errs = 0; ios->pending = 0; ios->queued = 0; ios->total = 0; ios->ema = 0; ios->emvar = 0; ios->softc = isc; cam_iosched_limiter_init(ios); } static int cam_iosched_limiter_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; struct iop_stats *ios; struct cam_iosched_softc *isc; int value, i, error; const char *p; ios = arg1; isc = ios->softc; value = ios->limiter; if (value < none || value >= limiter_max) p = "UNKNOWN"; else p = cam_iosched_limiter_names[value]; strlcpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return error; cam_periph_lock(isc->periph); for (i = none; i < limiter_max; i++) { if (strcmp(buf, cam_iosched_limiter_names[i]) != 0) continue; ios->limiter = i; error = cam_iosched_limiter_init(ios); if (error != 0) { ios->limiter = value; cam_periph_unlock(isc->periph); return error; } /* Note: disk load averate requires ticker to be always running */ callout_reset(&isc->ticker, hz / isc->quanta, cam_iosched_ticker, isc); isc->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE; cam_periph_unlock(isc->periph); return 0; } cam_periph_unlock(isc->periph); return EINVAL; } static int cam_iosched_control_type_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; struct control_loop *clp; struct cam_iosched_softc *isc; int value, i, error; const char *p; clp = arg1; isc = clp->softc; value = clp->type; if (value < none || value >= cl_max) p = "UNKNOWN"; else p = cam_iosched_control_type_names[value]; strlcpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return error; for (i = set_max; i < cl_max; i++) { if (strcmp(buf, cam_iosched_control_type_names[i]) != 0) continue; cam_periph_lock(isc->periph); clp->type = i; cam_periph_unlock(isc->periph); return 0; } return EINVAL; } static int cam_iosched_sbintime_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; sbintime_t value; int error; uint64_t us; value = *(sbintime_t *)arg1; us = (uint64_t)value / SBT_1US; snprintf(buf, sizeof(buf), "%ju", (intmax_t)us); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return error; us = strtoul(buf, NULL, 10); if (us == 0) return EINVAL; *(sbintime_t *)arg1 = us * SBT_1US; return 0; } static int cam_iosched_sysctl_latencies(SYSCTL_HANDLER_ARGS) { int i, error; struct sbuf sb; uint64_t *latencies; latencies = arg1; sbuf_new_for_sysctl(&sb, NULL, LAT_BUCKETS * 16, req); for (i = 0; i < LAT_BUCKETS - 1; i++) sbuf_printf(&sb, "%jd,", (intmax_t)latencies[i]); sbuf_printf(&sb, "%jd", (intmax_t)latencies[LAT_BUCKETS - 1]); error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } static int cam_iosched_quanta_sysctl(SYSCTL_HANDLER_ARGS) { int *quanta; int error, value; quanta = (unsigned *)arg1; value = *quanta; error = sysctl_handle_int(oidp, (int *)&value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); if (value < 1 || value > hz) return (EINVAL); *quanta = value; return (0); } static void cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stats *ios, char *name) { struct sysctl_oid_list *n; struct sysctl_ctx_list *ctx; ios->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx, SYSCTL_CHILDREN(isc->sysctl_tree), OID_AUTO, name, CTLFLAG_RD, 0, name); n = SYSCTL_CHILDREN(ios->sysctl_tree); ctx = &ios->sysctl_ctx; SYSCTL_ADD_UQUAD(ctx, n, OID_AUTO, "ema", CTLFLAG_RD, &ios->ema, "Fast Exponentially Weighted Moving Average"); SYSCTL_ADD_UQUAD(ctx, n, OID_AUTO, "emvar", CTLFLAG_RD, &ios->emvar, "Fast Exponentially Weighted Moving Variance"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "pending", CTLFLAG_RD, &ios->pending, 0, "Instantaneous # of pending transactions"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "count", CTLFLAG_RD, &ios->total, 0, "# of transactions submitted to hardware"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "queued", CTLFLAG_RD, &ios->queued, 0, "# of transactions in the queue"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "in", CTLFLAG_RD, &ios->in, 0, "# of transactions queued to driver"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "out", CTLFLAG_RD, &ios->out, 0, "# of transactions completed (including with error)"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "errs", CTLFLAG_RD, &ios->errs, 0, "# of transactions completed with an error"); SYSCTL_ADD_PROC(ctx, n, OID_AUTO, "limiter", CTLTYPE_STRING | CTLFLAG_RW, ios, 0, cam_iosched_limiter_sysctl, "A", "Current limiting type."); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "min", CTLFLAG_RW, &ios->min, 0, "min resource"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "max", CTLFLAG_RW, &ios->max, 0, "max resource"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "current", CTLFLAG_RW, &ios->current, 0, "current resource"); SYSCTL_ADD_PROC(ctx, n, OID_AUTO, "latencies", CTLTYPE_STRING | CTLFLAG_RD, &ios->latencies, 0, cam_iosched_sysctl_latencies, "A", "Array of power of 2 latency from 1ms to 1.024s"); } static void cam_iosched_iop_stats_fini(struct iop_stats *ios) { if (ios->sysctl_tree) if (sysctl_ctx_free(&ios->sysctl_ctx) != 0) printf("can't remove iosched sysctl stats context\n"); } static void cam_iosched_cl_sysctl_init(struct cam_iosched_softc *isc) { struct sysctl_oid_list *n; struct sysctl_ctx_list *ctx; struct control_loop *clp; clp = &isc->cl; clp->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx, SYSCTL_CHILDREN(isc->sysctl_tree), OID_AUTO, "control", CTLFLAG_RD, 0, "Control loop info"); n = SYSCTL_CHILDREN(clp->sysctl_tree); ctx = &clp->sysctl_ctx; SYSCTL_ADD_PROC(ctx, n, OID_AUTO, "type", CTLTYPE_STRING | CTLFLAG_RW, clp, 0, cam_iosched_control_type_sysctl, "A", "Control loop algorithm"); SYSCTL_ADD_PROC(ctx, n, OID_AUTO, "steer_interval", CTLTYPE_STRING | CTLFLAG_RW, &clp->steer_interval, 0, cam_iosched_sbintime_sysctl, "A", "How often to steer (in us)"); SYSCTL_ADD_PROC(ctx, n, OID_AUTO, "lolat", CTLTYPE_STRING | CTLFLAG_RW, &clp->lolat, 0, cam_iosched_sbintime_sysctl, "A", "Low water mark for Latency (in us)"); SYSCTL_ADD_PROC(ctx, n, OID_AUTO, "hilat", CTLTYPE_STRING | CTLFLAG_RW, &clp->hilat, 0, cam_iosched_sbintime_sysctl, "A", "Hi water mark for Latency (in us)"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "alpha", CTLFLAG_RW, &clp->alpha, 0, "Alpha for PLL (x100) aka gain"); } static void cam_iosched_cl_sysctl_fini(struct control_loop *clp) { if (clp->sysctl_tree) if (sysctl_ctx_free(&clp->sysctl_ctx) != 0) printf("can't remove iosched sysctl control loop context\n"); } #endif /* * Allocate the iosched structure. This also insulates callers from knowing * sizeof struct cam_iosched_softc. */ int -cam_iosched_init(struct cam_iosched_softc **iscp, struct cam_periph *periph, - u_int caps) +cam_iosched_init(struct cam_iosched_softc **iscp, struct cam_periph *periph) { *iscp = malloc(sizeof(**iscp), M_CAMSCHED, M_NOWAIT | M_ZERO); if (*iscp == NULL) return ENOMEM; - (*iscp)->caps = caps; - if (caps & CAM_IOSCHED_CAP_TRIM_CLOCKED) - (*iscp)->flags |= CAM_IOSCHED_FLAG_TRIM_QONLY; #ifdef CAM_IOSCHED_DYNAMIC if (iosched_debug) printf("CAM IOSCHEDULER Allocating entry at %p\n", *iscp); #endif (*iscp)->sort_io_queue = -1; bioq_init(&(*iscp)->bio_queue); bioq_init(&(*iscp)->trim_queue); #ifdef CAM_IOSCHED_DYNAMIC if (do_dynamic_iosched) { bioq_init(&(*iscp)->write_queue); (*iscp)->read_bias = 100; (*iscp)->current_read_bias = 100; (*iscp)->quanta = min(hz, 200); cam_iosched_iop_stats_init(*iscp, &(*iscp)->read_stats); cam_iosched_iop_stats_init(*iscp, &(*iscp)->write_stats); cam_iosched_iop_stats_init(*iscp, &(*iscp)->trim_stats); (*iscp)->trim_stats.max = 1; /* Trims are special: one at a time for now */ (*iscp)->last_time = sbinuptime(); callout_init_mtx(&(*iscp)->ticker, cam_periph_mtx(periph), 0); (*iscp)->periph = periph; cam_iosched_cl_init(&(*iscp)->cl, *iscp); callout_reset(&(*iscp)->ticker, hz / (*iscp)->quanta, cam_iosched_ticker, *iscp); (*iscp)->flags |= CAM_IOSCHED_FLAG_CALLOUT_ACTIVE; } #endif return 0; } /* * Reclaim all used resources. This assumes that other folks have * drained the requests in the hardware. Maybe an unwise assumption. */ void cam_iosched_fini(struct cam_iosched_softc *isc) { if (isc) { cam_iosched_flush(isc, NULL, ENXIO); #ifdef CAM_IOSCHED_DYNAMIC cam_iosched_iop_stats_fini(&isc->read_stats); cam_iosched_iop_stats_fini(&isc->write_stats); cam_iosched_iop_stats_fini(&isc->trim_stats); cam_iosched_cl_sysctl_fini(&isc->cl); if (isc->sysctl_tree) if (sysctl_ctx_free(&isc->sysctl_ctx) != 0) printf("can't remove iosched sysctl stats context\n"); if (isc->flags & CAM_IOSCHED_FLAG_CALLOUT_ACTIVE) { callout_drain(&isc->ticker); isc->flags &= ~ CAM_IOSCHED_FLAG_CALLOUT_ACTIVE; } #endif free(isc, M_CAMSCHED); } } /* * After we're sure we're attaching a device, go ahead and add * hooks for any sysctl we may wish to honor. */ void cam_iosched_sysctl_init(struct cam_iosched_softc *isc, struct sysctl_ctx_list *ctx, struct sysctl_oid *node) { #ifdef CAM_IOSCHED_DYNAMIC struct sysctl_oid_list *n; #endif SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "sort_io_queue", CTLFLAG_RW | CTLFLAG_MPSAFE, &isc->sort_io_queue, 0, "Sort IO queue to try and optimise disk access patterns"); #ifdef CAM_IOSCHED_DYNAMIC if (!do_dynamic_iosched) return; isc->sysctl_tree = SYSCTL_ADD_NODE(&isc->sysctl_ctx, SYSCTL_CHILDREN(node), OID_AUTO, "iosched", CTLFLAG_RD, 0, "I/O scheduler statistics"); n = SYSCTL_CHILDREN(isc->sysctl_tree); ctx = &isc->sysctl_ctx; cam_iosched_iop_stats_sysctl_init(isc, &isc->read_stats, "read"); cam_iosched_iop_stats_sysctl_init(isc, &isc->write_stats, "write"); cam_iosched_iop_stats_sysctl_init(isc, &isc->trim_stats, "trim"); cam_iosched_cl_sysctl_init(isc); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "read_bias", CTLFLAG_RW, &isc->read_bias, 100, "How biased towards read should we be independent of limits"); SYSCTL_ADD_PROC(ctx, n, OID_AUTO, "quanta", CTLTYPE_UINT | CTLFLAG_RW, &isc->quanta, 0, cam_iosched_quanta_sysctl, "I", "How many quanta per second do we slice the I/O up into"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "total_ticks", CTLFLAG_RD, &isc->total_ticks, 0, "Total number of ticks we've done"); SYSCTL_ADD_INT(ctx, n, OID_AUTO, "load", CTLFLAG_RD, &isc->load, 0, "scaled load average / 100"); #endif } /* * Flush outstanding I/O. Consumers of this library don't know all the * queues we may keep, so this allows all I/O to be flushed in one * convenient call. */ void cam_iosched_flush(struct cam_iosched_softc *isc, struct devstat *stp, int err) { bioq_flush(&isc->bio_queue, stp, err); bioq_flush(&isc->trim_queue, stp, err); #ifdef CAM_IOSCHED_DYNAMIC if (do_dynamic_iosched) bioq_flush(&isc->write_queue, stp, err); #endif } #ifdef CAM_IOSCHED_DYNAMIC static struct bio * -cam_iosched_get_write(struct cam_iosched_softc *isc, bool wastick) +cam_iosched_get_write(struct cam_iosched_softc *isc) { struct bio *bp; /* * We control the write rate by controlling how many requests we send * down to the drive at any one time. Fewer requests limits the * effects of both starvation when the requests take a while and write * amplification when each request is causing more than one write to * the NAND media. Limiting the queue depth like this will also limit * the write throughput and give and reads that want to compete to * compete unfairly. */ bp = bioq_first(&isc->write_queue); if (bp == NULL) { if (iosched_debug > 3) printf("No writes present in write_queue\n"); return NULL; } /* * If pending read, prefer that based on current read bias * setting. */ if (bioq_first(&isc->bio_queue) && isc->current_read_bias) { if (iosched_debug) printf( "Reads present and current_read_bias is %d queued " "writes %d queued reads %d\n", isc->current_read_bias, isc->write_stats.queued, isc->read_stats.queued); isc->current_read_bias--; /* We're not limiting writes, per se, just doing reads first */ return NULL; } /* * See if our current limiter allows this I/O. */ if (cam_iosched_limiter_iop(&isc->write_stats, bp) != 0) { if (iosched_debug) printf("Can't write because limiter says no.\n"); isc->write_stats.state_flags |= IOP_RATE_LIMITED; return NULL; } /* * Let's do this: We've passed all the gates and we're a go * to schedule the I/O in the SIM. */ isc->current_read_bias = isc->read_bias; bioq_remove(&isc->write_queue, bp); if (bp->bio_cmd == BIO_WRITE) { isc->write_stats.queued--; isc->write_stats.total++; isc->write_stats.pending++; } if (iosched_debug > 9) printf("HWQ : %p %#x\n", bp, bp->bio_cmd); isc->write_stats.state_flags &= ~IOP_RATE_LIMITED; return bp; } #endif /* * Put back a trim that you weren't able to actually schedule this time. */ void cam_iosched_put_back_trim(struct cam_iosched_softc *isc, struct bio *bp) { bioq_insert_head(&isc->trim_queue, bp); #ifdef CAM_IOSCHED_DYNAMIC isc->trim_stats.queued++; isc->trim_stats.total--; /* since we put it back, don't double count */ isc->trim_stats.pending--; #endif } /* * gets the next trim from the trim queue. * * Assumes we're called with the periph lock held. It removes this * trim from the queue and the device must explicitly reinsert it * should the need arise. */ struct bio * cam_iosched_next_trim(struct cam_iosched_softc *isc) { struct bio *bp; bp = bioq_first(&isc->trim_queue); if (bp == NULL) return NULL; bioq_remove(&isc->trim_queue, bp); #ifdef CAM_IOSCHED_DYNAMIC isc->trim_stats.queued--; isc->trim_stats.total++; isc->trim_stats.pending++; #endif return bp; } /* * gets an available trim from the trim queue, if there's no trim * already pending. It removes this trim from the queue and the device * must explicitly reinsert it should the need arise. * * Assumes we're called with the periph lock held. */ -static struct bio * -cam_iosched_get_trim(struct cam_iosched_softc *isc, bool wastick) +struct bio * +cam_iosched_get_trim(struct cam_iosched_softc *isc) { - /* - * If there's no trims, return NULL. If we're clocking out the - * trims rather than doing thins right away, this is where we - * set the queue only bit. This causes us to ignore them until - * the next clock tick. If we can't get a trim, and we're clocking - * them out, if the queue is empty or if we're rate limited, - * then set QONLY so we stop processing trims until the next - * tick. - */ - if (!cam_iosched_has_more_trim(isc)) { - if ((isc->caps & CAM_IOSCHED_CAP_TRIM_CLOCKED) && - (bioq_first(&isc->trim_queue) == NULL || -#ifdef CAM_IOSCHED_DYNAMIC - (isc->trim_stats.state_flags & IOP_RATE_LIMITED) -#else - false -#endif - )) - isc->flags |= CAM_IOSCHED_FLAG_TRIM_QONLY; + if (!cam_iosched_has_more_trim(isc)) return NULL; - } - /* - * If we just ticked, and we have trims, then turn off - * the queue only flag. - */ - if (wastick) - isc->flags &= ~CAM_IOSCHED_FLAG_TRIM_QONLY; - - /* - * If QONLY is set, no trims are eligble just now. - */ - if (isc->flags & CAM_IOSCHED_FLAG_TRIM_QONLY) - return NULL; - return cam_iosched_next_trim(isc); } /* * Determine what the next bit of work to do is for the periph. The * default implementation looks to see if we have trims to do, but no * trims outstanding. If so, we do that. Otherwise we see if we have * other work. If we do, then we do that. Otherwise why were we called? */ struct bio * cam_iosched_next_bio(struct cam_iosched_softc *isc) { struct bio *bp; bool wastick; - wastick = !!(isc->flags & CAM_IOSCHED_FLAG_TICK); - isc->flags &= ~CAM_IOSCHED_FLAG_TICK; + wastick = !!(isc->flags & CAM_IOSCHED_FLAGS_TICK); + isc->flags &= ~CAM_IOSCHED_FLAGS_TICK; /* * See if we have a trim that can be scheduled. We can only send one - * at a time down, so this takes that into account for those devices - * that can only do one. In addition, some devices queue up a bunch - * of TRIMs before sending them down as a batch. + * at a time down, so this takes that into account. + * + * XXX newer TRIM commands are queueable. Revisit this when we + * implement them. */ - if ((isc->flags & CAM_IOSCHED_FLAG_TRIM_QONLY) == 0 && - (bp = cam_iosched_get_trim(isc, wastick)) != NULL) + if ((bp = cam_iosched_get_trim(isc)) != NULL) return bp; #ifdef CAM_IOSCHED_DYNAMIC /* * See if we have any pending writes, and room in the queue for them, * and if so, those are next. */ if (do_dynamic_iosched) { - if ((bp = cam_iosched_get_write(isc, was_tick)) != NULL) + if ((bp = cam_iosched_get_write(isc)) != NULL) return bp; } #endif /* * next, see if there's other, normal I/O waiting. If so return that. */ if ((bp = bioq_first(&isc->bio_queue)) == NULL) return NULL; #ifdef CAM_IOSCHED_DYNAMIC /* * For the dynamic scheduler, bio_queue is only for reads, so enforce * the limits here. Enforce only for reads. */ if (do_dynamic_iosched) { if (bp->bio_cmd == BIO_READ && cam_iosched_limiter_iop(&isc->read_stats, bp) != 0) { isc->read_stats.state_flags |= IOP_RATE_LIMITED; return NULL; } } isc->read_stats.state_flags &= ~IOP_RATE_LIMITED; #endif bioq_remove(&isc->bio_queue, bp); #ifdef CAM_IOSCHED_DYNAMIC if (do_dynamic_iosched) { if (bp->bio_cmd == BIO_READ) { isc->read_stats.queued--; isc->read_stats.total++; isc->read_stats.pending++; } else printf("Found bio_cmd = %#x\n", bp->bio_cmd); } if (iosched_debug > 9) printf("HWQ : %p %#x\n", bp, bp->bio_cmd); #endif return bp; } /* * Driver has been given some work to do by the block layer. Tell the * scheduler about it and have it queue the work up. The scheduler module * will then return the currently most useful bit of work later, possibly * deferring work for various reasons. */ void cam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp) { /* * Put all trims on the trim queue sorted, since we know * that the collapsing code requires this. Otherwise put * the work on the bio queue. */ if (bp->bio_cmd == BIO_DELETE) { bioq_insert_tail(&isc->trim_queue, bp); #ifdef CAM_IOSCHED_DYNAMIC isc->trim_stats.in++; isc->trim_stats.queued++; #endif } #ifdef CAM_IOSCHED_DYNAMIC else if (do_dynamic_iosched && (bp->bio_cmd != BIO_READ)) { if (cam_iosched_sort_queue(isc)) bioq_disksort(&isc->write_queue, bp); else bioq_insert_tail(&isc->write_queue, bp); if (iosched_debug > 9) printf("Qw : %p %#x\n", bp, bp->bio_cmd); if (bp->bio_cmd == BIO_WRITE) { isc->write_stats.in++; isc->write_stats.queued++; } } #endif else { if (cam_iosched_sort_queue(isc)) bioq_disksort(&isc->bio_queue, bp); else bioq_insert_tail(&isc->bio_queue, bp); #ifdef CAM_IOSCHED_DYNAMIC if (iosched_debug > 9) printf("Qr : %p %#x\n", bp, bp->bio_cmd); if (bp->bio_cmd == BIO_READ) { isc->read_stats.in++; isc->read_stats.queued++; } else if (bp->bio_cmd == BIO_WRITE) { isc->write_stats.in++; isc->write_stats.queued++; } #endif } } /* * If we have work, get it scheduled. Called with the periph lock held. */ void cam_iosched_schedule(struct cam_iosched_softc *isc, struct cam_periph *periph) { if (cam_iosched_has_work(isc)) xpt_schedule(periph, CAM_PRIORITY_NORMAL); } /* * Complete a trim request. Mark that we no longer have one in flight. */ void cam_iosched_trim_done(struct cam_iosched_softc *isc) { isc->flags &= ~CAM_IOSCHED_FLAG_TRIM_ACTIVE; } /* * Complete a bio. Called before we release the ccb with xpt_release_ccb so we * might use notes in the ccb for statistics. */ int cam_iosched_bio_complete(struct cam_iosched_softc *isc, struct bio *bp, union ccb *done_ccb) { int retval = 0; #ifdef CAM_IOSCHED_DYNAMIC if (!do_dynamic_iosched) return retval; if (iosched_debug > 10) printf("done: %p %#x\n", bp, bp->bio_cmd); if (bp->bio_cmd == BIO_WRITE) { retval = cam_iosched_limiter_iodone(&isc->write_stats, bp); if (!(bp->bio_flags & BIO_ERROR)) isc->write_stats.errs++; isc->write_stats.out++; isc->write_stats.pending--; } else if (bp->bio_cmd == BIO_READ) { retval = cam_iosched_limiter_iodone(&isc->read_stats, bp); if (!(bp->bio_flags & BIO_ERROR)) isc->read_stats.errs++; isc->read_stats.out++; isc->read_stats.pending--; } else if (bp->bio_cmd == BIO_DELETE) { if (!(bp->bio_flags & BIO_ERROR)) isc->trim_stats.errs++; isc->trim_stats.out++; isc->trim_stats.pending--; } else if (bp->bio_cmd != BIO_FLUSH) { if (iosched_debug) printf("Completing command with bio_cmd == %#x\n", bp->bio_cmd); } if (!(bp->bio_flags & BIO_ERROR)) cam_iosched_io_metric_update(isc, cam_iosched_sbintime_t(done_ccb->ccb_h.qos.periph_data), bp->bio_cmd, bp->bio_bcount); #endif return retval; } /* * Tell the io scheduler that you've pushed a trim down into the sim. * This also tells the I/O scheduler not to push any more trims down, so * some periphs do not call it if they can cope with multiple trims in flight. */ void cam_iosched_submit_trim(struct cam_iosched_softc *isc) { isc->flags |= CAM_IOSCHED_FLAG_TRIM_ACTIVE; } /* * Change the sorting policy hint for I/O transactions for this device. */ void cam_iosched_set_sort_queue(struct cam_iosched_softc *isc, int val) { isc->sort_io_queue = val; } int cam_iosched_has_work_flags(struct cam_iosched_softc *isc, uint32_t flags) { return isc->flags & flags; } void cam_iosched_set_work_flags(struct cam_iosched_softc *isc, uint32_t flags) { isc->flags |= flags; } void cam_iosched_clr_work_flags(struct cam_iosched_softc *isc, uint32_t flags) { isc->flags &= ~flags; } #ifdef CAM_IOSCHED_DYNAMIC /* * After the method presented in Jack Crenshaw's 1998 article "Integer * Square Roots," reprinted at * http://www.embedded.com/electronics-blogs/programmer-s-toolbox/4219659/Integer-Square-Roots * and well worth the read. Briefly, we find the power of 4 that's the * largest smaller than val. We then check each smaller power of 4 to * see if val is still bigger. The right shifts at each step divide * the result by 2 which after successive application winds up * accumulating the right answer. It could also have been accumulated * using a separate root counter, but this code is smaller and faster * than that method. This method is also integer size invariant. * It returns floor(sqrt((float)val)), or the largest integer less than * or equal to the square root. */ static uint64_t isqrt64(uint64_t val) { uint64_t res = 0; uint64_t bit = 1ULL << (sizeof(uint64_t) * NBBY - 2); /* * Find the largest power of 4 smaller than val. */ while (bit > val) bit >>= 2; /* * Accumulate the answer, one bit at a time (we keep moving * them over since 2 is the square root of 4 and we test * powers of 4). We accumulate where we find the bit, but * the successive shifts land the bit in the right place * by the end. */ while (bit != 0) { if (val >= res + bit) { val -= res + bit; res = (res >> 1) + bit; } else res >>= 1; bit >>= 2; } return res; } static sbintime_t latencies[LAT_BUCKETS - 1] = { SBT_1MS << 0, SBT_1MS << 1, SBT_1MS << 2, SBT_1MS << 3, SBT_1MS << 4, SBT_1MS << 5, SBT_1MS << 6, SBT_1MS << 7, SBT_1MS << 8, SBT_1MS << 9, SBT_1MS << 10, SBT_1MS << 11, SBT_1MS << 12, SBT_1MS << 13 /* 8.192s */ }; static void cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency) { sbintime_t y, deltasq, delta; int i; /* * Keep counts for latency. We do it by power of two buckets. * This helps us spot outlier behavior obscured by averages. */ for (i = 0; i < LAT_BUCKETS - 1; i++) { if (sim_latency < latencies[i]) { iop->latencies[i]++; break; } } if (i == LAT_BUCKETS - 1) iop->latencies[i]++; /* Put all > 1024ms values into the last bucket. */ /* * Classic exponentially decaying average with a tiny alpha * (2 ^ -alpha_bits). For more info see the NIST statistical * handbook. * * ema_t = y_t * alpha + ema_t-1 * (1 - alpha) [nist] * ema_t = y_t * alpha + ema_t-1 - alpha * ema_t-1 * ema_t = alpha * y_t - alpha * ema_t-1 + ema_t-1 * alpha = 1 / (1 << alpha_bits) * sub e == ema_t-1, b == 1/alpha (== 1 << alpha_bits), d == y_t - ema_t-1 * = y_t/b - e/b + be/b * = (y_t - e + be) / b * = (e + d) / b * * Since alpha is a power of two, we can compute this w/o any mult or * division. * * Variance can also be computed. Usually, it would be expressed as follows: * diff_t = y_t - ema_t-1 * emvar_t = (1 - alpha) * (emavar_t-1 + diff_t^2 * alpha) * = emavar_t-1 - alpha * emavar_t-1 + delta_t^2 * alpha - (delta_t * alpha)^2 * sub b == 1/alpha (== 1 << alpha_bits), e == emavar_t-1, d = delta_t^2 * = e - e/b + dd/b + dd/bb * = (bbe - be + bdd + dd) / bb * = (bbe + b(dd-e) + dd) / bb (which is expanded below bb = 1<<(2*alpha_bits)) */ /* * XXX possible numeric issues * o We assume right shifted integers do the right thing, since that's * implementation defined. You can change the right shifts to / (1LL << alpha). * o alpha_bits = 9 gives ema ceiling of 23 bits of seconds for ema and 14 bits * for emvar. This puts a ceiling of 13 bits on alpha since we need a * few tens of seconds of representation. * o We mitigate alpha issues by never setting it too high. */ y = sim_latency; delta = (y - iop->ema); /* d */ iop->ema = ((iop->ema << alpha_bits) + delta) >> alpha_bits; /* * Were we to naively plow ahead at this point, we wind up with many numerical * issues making any SD > ~3ms unreliable. So, we shift right by 12. This leaves * us with microsecond level precision in the input, so the same in the * output. It means we can't overflow deltasq unless delta > 4k seconds. It * also means that emvar can be up 46 bits 40 of which are fraction, which * gives us a way to measure up to ~8s in the SD before the computation goes * unstable. Even the worst hard disk rarely has > 1s service time in the * drive. It does mean we have to shift left 12 bits after taking the * square root to compute the actual standard deviation estimate. This loss of * precision is preferable to needing int128 types to work. The above numbers * assume alpha=9. 10 or 11 are ok, but we start to run into issues at 12, * so 12 or 13 is OK for EMA, EMVAR and SD will be wrong in those cases. */ delta >>= 12; deltasq = delta * delta; /* dd */ iop->emvar = ((iop->emvar << (2 * alpha_bits)) + /* bbe */ ((deltasq - iop->emvar) << alpha_bits) + /* b(dd-e) */ deltasq) /* dd */ >> (2 * alpha_bits); /* div bb */ iop->sd = (sbintime_t)isqrt64((uint64_t)iop->emvar) << 12; } static void cam_iosched_io_metric_update(struct cam_iosched_softc *isc, sbintime_t sim_latency, int cmd, size_t size) { /* xxx Do we need to scale based on the size of the I/O ? */ switch (cmd) { case BIO_READ: cam_iosched_update(&isc->read_stats, sim_latency); break; case BIO_WRITE: cam_iosched_update(&isc->write_stats, sim_latency); break; case BIO_DELETE: cam_iosched_update(&isc->trim_stats, sim_latency); break; default: break; } } #ifdef DDB static int biolen(struct bio_queue_head *bq) { int i = 0; struct bio *bp; TAILQ_FOREACH(bp, &bq->queue, bio_queue) { i++; } return i; } /* * Show the internal state of the I/O scheduler. */ DB_SHOW_COMMAND(iosched, cam_iosched_db_show) { struct cam_iosched_softc *isc; if (!have_addr) { db_printf("Need addr\n"); return; } isc = (struct cam_iosched_softc *)addr; db_printf("pending_reads: %d\n", isc->read_stats.pending); db_printf("min_reads: %d\n", isc->read_stats.min); db_printf("max_reads: %d\n", isc->read_stats.max); db_printf("reads: %d\n", isc->read_stats.total); db_printf("in_reads: %d\n", isc->read_stats.in); db_printf("out_reads: %d\n", isc->read_stats.out); db_printf("queued_reads: %d\n", isc->read_stats.queued); db_printf("Current Q len %d\n", biolen(&isc->bio_queue)); db_printf("pending_writes: %d\n", isc->write_stats.pending); db_printf("min_writes: %d\n", isc->write_stats.min); db_printf("max_writes: %d\n", isc->write_stats.max); db_printf("writes: %d\n", isc->write_stats.total); db_printf("in_writes: %d\n", isc->write_stats.in); db_printf("out_writes: %d\n", isc->write_stats.out); db_printf("queued_writes: %d\n", isc->write_stats.queued); db_printf("Current Q len %d\n", biolen(&isc->write_queue)); db_printf("pending_trims: %d\n", isc->trim_stats.pending); db_printf("min_trims: %d\n", isc->trim_stats.min); db_printf("max_trims: %d\n", isc->trim_stats.max); db_printf("trims: %d\n", isc->trim_stats.total); db_printf("in_trims: %d\n", isc->trim_stats.in); db_printf("out_trims: %d\n", isc->trim_stats.out); db_printf("queued_trims: %d\n", isc->trim_stats.queued); db_printf("Current Q len %d\n", biolen(&isc->trim_queue)); db_printf("read_bias: %d\n", isc->read_bias); db_printf("current_read_bias: %d\n", isc->current_read_bias); db_printf("Trim active? %s\n", (isc->flags & CAM_IOSCHED_FLAG_TRIM_ACTIVE) ? "yes" : "no"); } #endif #endif Index: head/sys/cam/cam_iosched.h =================================================================== --- head/sys/cam/cam_iosched.h (revision 329818) +++ head/sys/cam/cam_iosched.h (revision 329819) @@ -1,105 +1,104 @@ /*- * CAM IO Scheduler Interface * * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Netflix, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _CAM_CAM_IOSCHED_H #define _CAM_CAM_IOSCHED_H /* No user-serviceable parts in here. */ #ifdef _KERNEL /* Forward declare all structs to keep interface thin */ struct cam_iosched_softc; struct sysctl_ctx_list; struct sysctl_oid; union ccb; struct bio; /* * For 64-bit platforms, we know that uintptr_t is the same size as sbintime_t * so we can store values in it. For 32-bit systems, however, uintptr_t is only * 32-bits, so it won't fit. For those systems, store 24 bits of fraction and 8 * bits of seconds. This allows us to measure an interval of up to ~256s, which * is ~200x what our current uses require. Provide some convenience functions to * get the time, subtract two times and convert back to sbintime_t in a safe way * that can be centralized. */ #ifdef __LP64__ #define CAM_IOSCHED_TIME_SHIFT 0 #else #define CAM_IOSCHED_TIME_SHIFT 8 #endif static inline uintptr_t cam_iosched_now(void) { /* Cast here is to avoid right shifting a signed value */ return (uintptr_t)((uint64_t)sbinuptime() >> CAM_IOSCHED_TIME_SHIFT); } static inline uintptr_t cam_iosched_delta_t(uintptr_t then) { /* Since the types are identical, wrapping works correctly */ return (cam_iosched_now() - then); } static inline sbintime_t cam_iosched_sbintime_t(uintptr_t delta) { /* Cast here is to widen the type so the left shift doesn't lose precision */ return (sbintime_t)((uint64_t)delta << CAM_IOSCHED_TIME_SHIFT); } -#define CAM_IOSCHED_CAP_TRIM_CLOCKED 0x1 - -int cam_iosched_init(struct cam_iosched_softc **, struct cam_periph *periph, u_int caps); +int cam_iosched_init(struct cam_iosched_softc **, struct cam_periph *periph); void cam_iosched_fini(struct cam_iosched_softc *); void cam_iosched_sysctl_init(struct cam_iosched_softc *, struct sysctl_ctx_list *, struct sysctl_oid *); struct bio *cam_iosched_next_trim(struct cam_iosched_softc *isc); +struct bio *cam_iosched_get_trim(struct cam_iosched_softc *isc); struct bio *cam_iosched_next_bio(struct cam_iosched_softc *isc); void cam_iosched_queue_work(struct cam_iosched_softc *isc, struct bio *bp); void cam_iosched_flush(struct cam_iosched_softc *isc, struct devstat *stp, int err); void cam_iosched_schedule(struct cam_iosched_softc *isc, struct cam_periph *periph); void cam_iosched_finish_trim(struct cam_iosched_softc *isc); void cam_iosched_submit_trim(struct cam_iosched_softc *isc); void cam_iosched_put_back_trim(struct cam_iosched_softc *isc, struct bio *bp); void cam_iosched_set_sort_queue(struct cam_iosched_softc *isc, int val); int cam_iosched_has_work_flags(struct cam_iosched_softc *isc, uint32_t flags); void cam_iosched_set_work_flags(struct cam_iosched_softc *isc, uint32_t flags); void cam_iosched_clr_work_flags(struct cam_iosched_softc *isc, uint32_t flags); void cam_iosched_trim_done(struct cam_iosched_softc *isc); int cam_iosched_bio_complete(struct cam_iosched_softc *isc, struct bio *bp, union ccb *done_ccb); #endif #endif Index: head/sys/cam/nvme/nvme_da.c =================================================================== --- head/sys/cam/nvme/nvme_da.c (revision 329818) +++ head/sys/cam/nvme/nvme_da.c (revision 329819) @@ -1,1170 +1,1145 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015 Netflix, Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Derived from ata_da.c: * Copyright (c) 2009 Alexander Motin */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #ifndef _KERNEL #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include typedef enum { NDA_STATE_NORMAL } nda_state; typedef enum { NDA_FLAG_OPEN = 0x0001, NDA_FLAG_DIRTY = 0x0002, NDA_FLAG_SCTX_INIT = 0x0004, } nda_flags; typedef enum { NDA_Q_4K = 0x01, NDA_Q_NONE = 0x00, } nda_quirks; #define NDA_Q_BIT_STRING \ "\020" \ "\001Bit 0" typedef enum { NDA_CCB_BUFFER_IO = 0x01, NDA_CCB_DUMP = 0x02, NDA_CCB_TRIM = 0x03, NDA_CCB_TYPE_MASK = 0x0F, } nda_ccb_state; /* Offsets into our private area for storing information */ #define ccb_state ppriv_field0 #define ccb_bp ppriv_ptr1 struct trim_request { TAILQ_HEAD(, bio) bps; }; struct nda_softc { struct cam_iosched_softc *cam_iosched; int outstanding_cmds; /* Number of active commands */ int refcount; /* Active xpt_action() calls */ nda_state state; nda_flags flags; nda_quirks quirks; int unmappedio; uint32_t nsid; /* Namespace ID for this nda device */ struct disk *disk; struct task sysctl_task; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; struct trim_request trim_req; #ifdef CAM_IO_STATS struct sysctl_ctx_list sysctl_stats_ctx; struct sysctl_oid *sysctl_stats_tree; u_int timeouts; u_int errors; u_int invalidations; #endif }; -struct nda_trim_request { - union { - struct nvme_dsm_range dsm; - uint8_t data[NVME_MAX_DSM_TRIM]; - } u; - TAILQ_HEAD(, bio) bps; -}; - /* Need quirk table */ static disk_strategy_t ndastrategy; static dumper_t ndadump; static periph_init_t ndainit; static void ndaasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static void ndasysctlinit(void *context, int pending); static periph_ctor_t ndaregister; static periph_dtor_t ndacleanup; static periph_start_t ndastart; static periph_oninv_t ndaoninvalidate; static void ndadone(struct cam_periph *periph, union ccb *done_ccb); static int ndaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static void ndashutdown(void *arg, int howto); static void ndasuspend(void *arg); #ifndef NDA_DEFAULT_SEND_ORDERED #define NDA_DEFAULT_SEND_ORDERED 1 #endif #ifndef NDA_DEFAULT_TIMEOUT #define NDA_DEFAULT_TIMEOUT 30 /* Timeout in seconds */ #endif #ifndef NDA_DEFAULT_RETRY #define NDA_DEFAULT_RETRY 4 #endif -#ifndef NDA_MAX_TRIM_ENTRIES -#define NDA_MAX_TRIM_ENTRIES 256 /* Number of DSM trims to use, max 256 */ -#endif + //static int nda_retry_count = NDA_DEFAULT_RETRY; static int nda_send_ordered = NDA_DEFAULT_SEND_ORDERED; static int nda_default_timeout = NDA_DEFAULT_TIMEOUT; -static int nda_max_trim_entries = NDA_MAX_TRIM_ENTRIES; /* * All NVMe media is non-rotational, so all nvme device instances * share this to implement the sysctl. */ static int nda_rotating_media = 0; static SYSCTL_NODE(_kern_cam, OID_AUTO, nda, CTLFLAG_RD, 0, "CAM Direct Access Disk driver"); static struct periph_driver ndadriver = { ndainit, "nda", TAILQ_HEAD_INITIALIZER(ndadriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(nda, ndadriver); static MALLOC_DEFINE(M_NVMEDA, "nvme_da", "nvme_da buffers"); /* * nice wrappers. Maybe these belong in nvme_all.c instead of * here, but this is the only place that uses these. Should * we ever grow another NVME periph, we should move them * all there wholesale. */ static void nda_nvme_flush(struct nda_softc *softc, struct ccb_nvmeio *nvmeio) { cam_fill_nvmeio(nvmeio, 0, /* retries */ ndadone, /* cbfcnp */ CAM_DIR_NONE, /* flags */ NULL, /* data_ptr */ 0, /* dxfer_len */ nda_default_timeout * 1000); /* timeout 30s */ nvme_ns_flush_cmd(&nvmeio->cmd, softc->nsid); } static void nda_nvme_trim(struct nda_softc *softc, struct ccb_nvmeio *nvmeio, void *payload, uint32_t num_ranges) { cam_fill_nvmeio(nvmeio, 0, /* retries */ ndadone, /* cbfcnp */ CAM_DIR_OUT, /* flags */ payload, /* data_ptr */ num_ranges * sizeof(struct nvme_dsm_range), /* dxfer_len */ nda_default_timeout * 1000); /* timeout 30s */ nvme_ns_trim_cmd(&nvmeio->cmd, softc->nsid, num_ranges); } static void nda_nvme_write(struct nda_softc *softc, struct ccb_nvmeio *nvmeio, void *payload, uint64_t lba, uint32_t len, uint32_t count) { cam_fill_nvmeio(nvmeio, 0, /* retries */ ndadone, /* cbfcnp */ CAM_DIR_OUT, /* flags */ payload, /* data_ptr */ len, /* dxfer_len */ nda_default_timeout * 1000); /* timeout 30s */ nvme_ns_write_cmd(&nvmeio->cmd, softc->nsid, lba, count); } static void nda_nvme_rw_bio(struct nda_softc *softc, struct ccb_nvmeio *nvmeio, struct bio *bp, uint32_t rwcmd) { int flags = rwcmd == NVME_OPC_READ ? CAM_DIR_IN : CAM_DIR_OUT; void *payload; uint64_t lba; uint32_t count; if (bp->bio_flags & BIO_UNMAPPED) { flags |= CAM_DATA_BIO; payload = bp; } else { payload = bp->bio_data; } lba = bp->bio_pblkno; count = bp->bio_bcount / softc->disk->d_sectorsize; cam_fill_nvmeio(nvmeio, 0, /* retries */ ndadone, /* cbfcnp */ flags, /* flags */ payload, /* data_ptr */ bp->bio_bcount, /* dxfer_len */ nda_default_timeout * 1000); /* timeout 30s */ nvme_ns_rw_cmd(&nvmeio->cmd, rwcmd, softc->nsid, lba, count); } static int ndaopen(struct disk *dp) { struct cam_periph *periph; struct nda_softc *softc; int error; periph = (struct cam_periph *)dp->d_drv1; if (cam_periph_acquire(periph) != 0) { return(ENXIO); } cam_periph_lock(periph); if ((error = cam_periph_hold(periph, PRIBIO|PCATCH)) != 0) { cam_periph_unlock(periph); cam_periph_release(periph); return (error); } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("ndaopen\n")); softc = (struct nda_softc *)periph->softc; softc->flags |= NDA_FLAG_OPEN; cam_periph_unhold(periph); cam_periph_unlock(periph); return (0); } static int ndaclose(struct disk *dp) { struct cam_periph *periph; struct nda_softc *softc; union ccb *ccb; int error; periph = (struct cam_periph *)dp->d_drv1; softc = (struct nda_softc *)periph->softc; cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("ndaclose\n")); if ((softc->flags & NDA_FLAG_DIRTY) != 0 && (periph->flags & CAM_PERIPH_INVALID) == 0 && cam_periph_hold(periph, PRIBIO) == 0) { ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); nda_nvme_flush(softc, &ccb->nvmeio); error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0, /*sense_flags*/0, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); else softc->flags &= ~NDA_FLAG_DIRTY; xpt_release_ccb(ccb); cam_periph_unhold(periph); } softc->flags &= ~NDA_FLAG_OPEN; while (softc->refcount != 0) cam_periph_sleep(periph, &softc->refcount, PRIBIO, "ndaclose", 1); cam_periph_unlock(periph); cam_periph_release(periph); return (0); } static void ndaschedule(struct cam_periph *periph) { struct nda_softc *softc = (struct nda_softc *)periph->softc; if (softc->state != NDA_STATE_NORMAL) return; cam_iosched_schedule(softc->cam_iosched, periph); } /* * Actually translate the requested transfer into one the physical driver * can understand. The transfer is described by a buf and will include * only one physical transfer. */ static void ndastrategy(struct bio *bp) { struct cam_periph *periph; struct nda_softc *softc; periph = (struct cam_periph *)bp->bio_disk->d_drv1; softc = (struct nda_softc *)periph->softc; cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastrategy(%p)\n", bp)); /* * If the device has been made invalid, error out */ if ((periph->flags & CAM_PERIPH_INVALID) != 0) { cam_periph_unlock(periph); biofinish(bp, NULL, ENXIO); return; } /* * Place it in the queue of disk activities for this disk */ cam_iosched_queue_work(softc->cam_iosched, bp); /* * Schedule ourselves for performing the work. */ ndaschedule(periph); cam_periph_unlock(periph); return; } static int ndadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct cam_periph *periph; struct nda_softc *softc; u_int secsize; struct ccb_nvmeio nvmeio; struct disk *dp; uint64_t lba; uint32_t count; int error = 0; dp = arg; periph = dp->d_drv1; softc = (struct nda_softc *)periph->softc; secsize = softc->disk->d_sectorsize; lba = offset / secsize; count = length / secsize; if ((periph->flags & CAM_PERIPH_INVALID) != 0) return (ENXIO); /* xpt_get_ccb returns a zero'd allocation for the ccb, mimic that here */ memset(&nvmeio, 0, sizeof(nvmeio)); if (length > 0) { xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); nvmeio.ccb_h.ccb_state = NDA_CCB_DUMP; nda_nvme_write(softc, &nvmeio, virtual, lba, length, count); error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if (error != 0) printf("Aborting dump due to I/O error %d.\n", error); return (error); } /* Flush */ xpt_setup_ccb(&nvmeio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); nvmeio.ccb_h.ccb_state = NDA_CCB_DUMP; nda_nvme_flush(softc, &nvmeio); error = cam_periph_runccb((union ccb *)&nvmeio, cam_periph_error, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if (error != 0) xpt_print(periph->path, "flush cmd failed\n"); return (error); } static void ndainit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, ndaasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("nda: Failed to attach master async callback " "due to status 0x%x!\n", status); } else if (nda_send_ordered) { /* Register our event handlers */ if ((EVENTHANDLER_REGISTER(power_suspend, ndasuspend, NULL, EVENTHANDLER_PRI_LAST)) == NULL) printf("ndainit: power event registration failed!\n"); if ((EVENTHANDLER_REGISTER(shutdown_post_sync, ndashutdown, NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) printf("ndainit: shutdown event registration failed!\n"); } } /* * Callback from GEOM, called when it has finished cleaning up its * resources. */ static void ndadiskgonecb(struct disk *dp) { struct cam_periph *periph; periph = (struct cam_periph *)dp->d_drv1; cam_periph_release(periph); } static void ndaoninvalidate(struct cam_periph *periph) { struct nda_softc *softc; softc = (struct nda_softc *)periph->softc; /* * De-register any async callbacks. */ xpt_register_async(0, ndaasync, periph, periph->path); #ifdef CAM_IO_STATS softc->invalidations++; #endif /* * Return all queued I/O with ENXIO. * XXX Handle any transactions queued to the card * with XPT_ABORT_CCB. */ cam_iosched_flush(softc->cam_iosched, NULL, ENXIO); disk_gone(softc->disk); } static void ndacleanup(struct cam_periph *periph) { struct nda_softc *softc; softc = (struct nda_softc *)periph->softc; cam_periph_unlock(periph); cam_iosched_fini(softc->cam_iosched); /* * If we can't free the sysctl tree, oh well... */ if ((softc->flags & NDA_FLAG_SCTX_INIT) != 0) { #ifdef CAM_IO_STATS if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0) xpt_print(periph->path, "can't remove sysctl stats context\n"); #endif if (sysctl_ctx_free(&softc->sysctl_ctx) != 0) xpt_print(periph->path, "can't remove sysctl context\n"); } disk_destroy(softc->disk); free(softc, M_DEVBUF); cam_periph_lock(periph); } static void ndaasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; if (cgd->protocol != PROTO_NVME) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(ndaregister, ndaoninvalidate, ndacleanup, ndastart, "nda", CAM_PERIPH_BIO, path, ndaasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) printf("ndaasync: Unable to attach to new device " "due to status 0x%x\n", status); break; } case AC_ADVINFO_CHANGED: { uintptr_t buftype; buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct nda_softc *softc; softc = periph->softc; disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); } break; } case AC_LOST_DEVICE: default: cam_periph_async(periph, code, path, arg); break; } } static void ndasysctlinit(void *context, int pending) { struct cam_periph *periph; struct nda_softc *softc; char tmpstr[32], tmpstr2[16]; periph = (struct cam_periph *)context; /* periph was held for us when this task was enqueued */ if ((periph->flags & CAM_PERIPH_INVALID) != 0) { cam_periph_release(periph); return; } softc = (struct nda_softc *)periph->softc; snprintf(tmpstr, sizeof(tmpstr), "CAM NDA unit %d", periph->unit_number); snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number); sysctl_ctx_init(&softc->sysctl_ctx); softc->flags |= NDA_FLAG_SCTX_INIT; softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam_nda), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr, "device_index"); if (softc->sysctl_tree == NULL) { printf("ndasysctlinit: unable to allocate sysctl tree\n"); cam_periph_release(periph); return; } SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->unmappedio, 0, "Unmapped I/O leaf"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "rotating", CTLFLAG_RD | CTLFLAG_MPSAFE, &nda_rotating_media, 0, "Rotating media"); #ifdef CAM_IO_STATS softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats", CTLFLAG_RD, 0, "Statistics"); if (softc->sysctl_stats_tree == NULL) { printf("ndasysctlinit: unable to allocate sysctl tree for stats\n"); cam_periph_release(periph); return; } SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->timeouts, 0, "Device timeouts reported by the SIM"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->errors, 0, "Transport errors reported by the SIM."); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE, &softc->invalidations, 0, "Device pack invalidations."); #endif cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx, softc->sysctl_tree); cam_periph_release(periph); } static int ndagetattr(struct bio *bp) { int ret; struct cam_periph *periph; periph = (struct cam_periph *)bp->bio_disk->d_drv1; cam_periph_lock(periph); ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute, periph->path); cam_periph_unlock(periph); if (ret == 0) bp->bio_completed = bp->bio_length; return ret; } static cam_status ndaregister(struct cam_periph *periph, void *arg) { struct nda_softc *softc; struct disk *disk; struct ccb_pathinq cpi; const struct nvme_namespace_data *nsd; const struct nvme_controller_data *cd; char announce_buf[80]; u_int maxio; int quirks; nsd = nvme_get_identify_ns(periph); cd = nvme_get_identify_cntrl(periph); softc = (struct nda_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT | M_ZERO); if (softc == NULL) { printf("ndaregister: Unable to probe new device. " "Unable to allocate softc\n"); return(CAM_REQ_CMP_ERR); } - if (cam_iosched_init(&softc->cam_iosched, periph, - CAM_IOSCHED_CAP_TRIM_CLOCKED) != 0) { + if (cam_iosched_init(&softc->cam_iosched, periph) != 0) { printf("ndaregister: Unable to probe new device. " "Unable to allocate iosched memory\n"); return(CAM_REQ_CMP_ERR); } /* ident_data parsing */ periph->softc = softc; softc->quirks = NDA_Q_NONE; xpt_path_inq(&cpi, periph->path); TASK_INIT(&softc->sysctl_task, 0, ndasysctlinit, periph); /* * The name space ID is the lun, save it for later I/O */ softc->nsid = (uint32_t)xpt_path_lun_id(periph->path); /* * Register this media as a disk */ (void)cam_periph_hold(periph, PRIBIO); cam_periph_unlock(periph); snprintf(announce_buf, sizeof(announce_buf), "kern.cam.nda.%d.quirks", periph->unit_number); quirks = softc->quirks; TUNABLE_INT_FETCH(announce_buf, &quirks); softc->quirks = quirks; cam_iosched_set_sort_queue(softc->cam_iosched, 0); softc->disk = disk = disk_alloc(); strlcpy(softc->disk->d_descr, cd->mn, MIN(sizeof(softc->disk->d_descr), sizeof(cd->mn))); strlcpy(softc->disk->d_ident, cd->sn, MIN(sizeof(softc->disk->d_ident), sizeof(cd->sn))); disk->d_rotation_rate = DISK_RR_NON_ROTATING; disk->d_open = ndaopen; disk->d_close = ndaclose; disk->d_strategy = ndastrategy; disk->d_getattr = ndagetattr; disk->d_dump = ndadump; disk->d_gone = ndadiskgonecb; disk->d_name = "nda"; disk->d_drv1 = periph; disk->d_unit = periph->unit_number; maxio = cpi.maxio; /* Honor max I/O size of SIM */ if (maxio == 0) maxio = DFLTPHYS; /* traditional default */ else if (maxio > MAXPHYS) maxio = MAXPHYS; /* for safety */ disk->d_maxsize = maxio; disk->d_sectorsize = 1 << nsd->lbaf[nsd->flbas.format].lbads; disk->d_mediasize = (off_t)(disk->d_sectorsize * nsd->nsze); disk->d_delmaxsize = disk->d_mediasize; disk->d_flags = DISKFLAG_DIRECT_COMPLETION; // if (cd->oncs.dsm) // XXX broken? disk->d_flags |= DISKFLAG_CANDELETE; if (cd->vwc.present) disk->d_flags |= DISKFLAG_CANFLUSHCACHE; if ((cpi.hba_misc & PIM_UNMAPPED) != 0) { disk->d_flags |= DISKFLAG_UNMAPPED_BIO; softc->unmappedio = 1; } /* * d_ident and d_descr are both far bigger than the length of either * the serial or model number strings. */ nvme_strvis(disk->d_descr, cd->mn, sizeof(disk->d_descr), NVME_MODEL_NUMBER_LENGTH); nvme_strvis(disk->d_ident, cd->sn, sizeof(disk->d_ident), NVME_SERIAL_NUMBER_LENGTH); disk->d_hba_vendor = cpi.hba_vendor; disk->d_hba_device = cpi.hba_device; disk->d_hba_subvendor = cpi.hba_subvendor; disk->d_hba_subdevice = cpi.hba_subdevice; disk->d_stripesize = disk->d_sectorsize; disk->d_stripeoffset = 0; disk->d_devstat = devstat_new_entry(periph->periph_name, periph->unit_number, disk->d_sectorsize, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | XPORT_DEVSTAT_TYPE(cpi.transport), DEVSTAT_PRIORITY_DISK); /* * Add alias for older nvd drives to ease transition. */ /* disk_add_alias(disk, "nvd"); Have reports of this causing problems */ /* * Acquire a reference to the periph before we register with GEOM. * We'll release this reference once GEOM calls us back (via * ndadiskgonecb()) telling us that our provider has been freed. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } disk_create(softc->disk, DISK_VERSION); cam_periph_lock(periph); cam_periph_unhold(periph); snprintf(announce_buf, sizeof(announce_buf), "%juMB (%ju %u byte sectors)", (uintmax_t)((uintmax_t)disk->d_mediasize / (1024*1024)), (uintmax_t)disk->d_mediasize / disk->d_sectorsize, disk->d_sectorsize); xpt_announce_periph(periph, announce_buf); xpt_announce_quirks(periph, softc->quirks, NDA_Q_BIT_STRING); /* * Create our sysctl variables, now that we know * we have successfully attached. */ if (cam_periph_acquire(periph) == 0) taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task); /* * Register for device going away and info about the drive * changing (though with NVMe, it can't) */ xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED, ndaasync, periph, periph->path); softc->state = NDA_STATE_NORMAL; return(CAM_REQ_CMP); } static void ndastart(struct cam_periph *periph, union ccb *start_ccb) { struct nda_softc *softc = (struct nda_softc *)periph->softc; struct ccb_nvmeio *nvmeio = &start_ccb->nvmeio; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart\n")); switch (softc->state) { case NDA_STATE_NORMAL: { struct bio *bp; bp = cam_iosched_next_bio(softc->cam_iosched); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("ndastart: bio %p\n", bp)); if (bp == NULL) { xpt_release_ccb(start_ccb); break; } switch (bp->bio_cmd) { case BIO_WRITE: softc->flags |= NDA_FLAG_DIRTY; /* FALLTHROUGH */ case BIO_READ: { #ifdef NDA_TEST_FAILURE int fail = 0; /* * Support the failure ioctls. If the command is a * read, and there are pending forced read errors, or * if a write and pending write errors, then fail this * operation with EIO. This is useful for testing * purposes. Also, support having every Nth read fail. * * This is a rather blunt tool. */ if (bp->bio_cmd == BIO_READ) { if (softc->force_read_error) { softc->force_read_error--; fail = 1; } if (softc->periodic_read_error > 0) { if (++softc->periodic_read_count >= softc->periodic_read_error) { softc->periodic_read_count = 0; fail = 1; } } } else { if (softc->force_write_error) { softc->force_write_error--; fail = 1; } } if (fail) { biofinish(bp, NULL, EIO); xpt_release_ccb(start_ccb); ndaschedule(periph); return; } #endif KASSERT((bp->bio_flags & BIO_UNMAPPED) == 0 || round_page(bp->bio_bcount + bp->bio_ma_offset) / PAGE_SIZE == bp->bio_ma_n, ("Short bio %p", bp)); nda_nvme_rw_bio(softc, &start_ccb->nvmeio, bp, bp->bio_cmd == BIO_READ ? NVME_OPC_READ : NVME_OPC_WRITE); break; } case BIO_DELETE: { - struct nvme_dsm_range *dsm_range, *dsm_end; - struct nda_trim_request *trim; - struct bio *bp1; - int ents; + struct nvme_dsm_range *dsm_range; - trim = malloc(sizeof(*trim), M_NVMEDA, M_ZERO | M_NOWAIT); - if (trim == NULL) { + dsm_range = + malloc(sizeof(*dsm_range), M_NVMEDA, M_ZERO | M_NOWAIT); + if (dsm_range == NULL) { biofinish(bp, NULL, ENOMEM); xpt_release_ccb(start_ccb); ndaschedule(periph); return; } - TAILQ_INIT(&trim->bps); - bp1 = bp; - ents = sizeof(trim->u.data) / sizeof(struct nvme_dsm_range); - ents = min(ents, nda_max_trim_entries); - dsm_range = &trim->u.dsm; - dsm_end = dsm_range + ents; - do { - TAILQ_INSERT_TAIL(&trim->bps, bp1, bio_queue); - dsm_range->length = - bp1->bio_bcount / softc->disk->d_sectorsize; - dsm_range->starting_lba = - bp1->bio_offset / softc->disk->d_sectorsize; - dsm_range++; - if (dsm_range >= dsm_end) - break; - bp1 = cam_iosched_next_trim(softc->cam_iosched); - /* XXX -- Could collapse adjacent ranges, but we don't for now */ - /* XXX -- Could limit based on total payload size */ - } while (bp1 != NULL); - bp->bio_driver2 = trim; - nda_nvme_trim(softc, &start_ccb->nvmeio, &trim->u.dsm, - dsm_range - &trim->u.dsm); + dsm_range->length = + bp->bio_bcount / softc->disk->d_sectorsize; + dsm_range->starting_lba = + bp->bio_offset / softc->disk->d_sectorsize; + bp->bio_driver2 = dsm_range; + nda_nvme_trim(softc, &start_ccb->nvmeio, dsm_range, 1); start_ccb->ccb_h.ccb_state = NDA_CCB_TRIM; start_ccb->ccb_h.flags |= CAM_UNLOCKED; /* * Note: We can have multiple TRIMs in flight, so we don't call * cam_iosched_submit_trim(softc->cam_iosched); * since that forces the I/O scheduler to only schedule one at a time. * On NVMe drives, this is a performance disaster. */ goto out; } case BIO_FLUSH: nda_nvme_flush(softc, nvmeio); break; } start_ccb->ccb_h.ccb_state = NDA_CCB_BUFFER_IO; start_ccb->ccb_h.flags |= CAM_UNLOCKED; out: start_ccb->ccb_h.ccb_bp = bp; softc->outstanding_cmds++; softc->refcount++; cam_periph_unlock(periph); xpt_action(start_ccb); cam_periph_lock(periph); softc->refcount--; /* May have more work to do, so ensure we stay scheduled */ ndaschedule(periph); break; } } } static void ndadone(struct cam_periph *periph, union ccb *done_ccb) { struct nda_softc *softc; struct ccb_nvmeio *nvmeio = &done_ccb->nvmeio; struct cam_path *path; int state; softc = (struct nda_softc *)periph->softc; path = done_ccb->ccb_h.path; CAM_DEBUG(path, CAM_DEBUG_TRACE, ("ndadone\n")); state = nvmeio->ccb_h.ccb_state & NDA_CCB_TYPE_MASK; switch (state) { case NDA_CCB_BUFFER_IO: case NDA_CCB_TRIM: { struct bio *bp; int error; cam_periph_lock(periph); bp = (struct bio *)done_ccb->ccb_h.ccb_bp; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { error = ndaerror(done_ccb, 0, 0); if (error == ERESTART) { /* A retry was scheduled, so just return. */ cam_periph_unlock(periph); return; } if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } else { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) panic("REQ_CMP with QFRZN"); error = 0; } bp->bio_error = error; if (error != 0) { bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; } else { bp->bio_resid = 0; } + if (state == NDA_CCB_TRIM) + free(bp->bio_driver2, M_NVMEDA); softc->outstanding_cmds--; /* * We need to call cam_iosched before we call biodone so that we * don't measure any activity that happens in the completion * routine, which in the case of sendfile can be quite * extensive. */ cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb); xpt_release_ccb(done_ccb); if (state == NDA_CCB_TRIM) { - struct nda_trim_request *trim; - struct bio *bp1; +#ifdef notyet TAILQ_HEAD(, bio) queue; + struct bio *bp1; - trim = bp->bio_driver2; TAILQ_INIT(&queue); - TAILQ_CONCAT(&queue, &trim->bps, bio_queue); - free(trim, M_NVMEDA); - + TAILQ_CONCAT(&queue, &softc->trim_req.bps, bio_queue); +#endif /* * Since we can have multiple trims in flight, we don't * need to call this here. * cam_iosched_trim_done(softc->cam_iosched); */ ndaschedule(periph); cam_periph_unlock(periph); +#ifdef notyet +/* Not yet collapsing several BIO_DELETE requests into one TRIM */ while ((bp1 = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, bp1, bio_queue); bp1->bio_error = error; if (error != 0) { bp1->bio_flags |= BIO_ERROR; bp1->bio_resid = bp1->bio_bcount; } else bp1->bio_resid = 0; biodone(bp1); } +#else + biodone(bp); +#endif } else { ndaschedule(periph); cam_periph_unlock(periph); biodone(bp); } return; } case NDA_CCB_DUMP: /* No-op. We're polling */ return; default: break; } xpt_release_ccb(done_ccb); } static int ndaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { struct nda_softc *softc; struct cam_periph *periph; periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct nda_softc *)periph->softc; switch (ccb->ccb_h.status & CAM_STATUS_MASK) { case CAM_CMD_TIMEOUT: #ifdef CAM_IO_STATS softc->timeouts++; #endif break; case CAM_REQ_ABORTED: case CAM_REQ_CMP_ERR: case CAM_REQ_TERMIO: case CAM_UNREC_HBA_ERROR: case CAM_DATA_RUN_ERR: case CAM_ATA_STATUS_ERROR: #ifdef CAM_IO_STATS softc->errors++; #endif break; default: break; } return(cam_periph_error(ccb, cam_flags, sense_flags)); } /* * Step through all NDA peripheral drivers, and if the device is still open, * sync the disk cache to physical media. */ static void ndaflush(void) { struct cam_periph *periph; struct nda_softc *softc; union ccb *ccb; int error; CAM_PERIPH_FOREACH(periph, &ndadriver) { softc = (struct nda_softc *)periph->softc; if (SCHEDULER_STOPPED()) { /* * If we paniced with the lock held or the periph is not * open, do not recurse. Otherwise, call ndadump since * that avoids the sleeping cam_periph_getccb does if no * CCBs are available. */ if (!cam_periph_owned(periph) && (softc->flags & NDA_FLAG_OPEN)) { ndadump(softc->disk, NULL, 0, 0, 0); } continue; } /* * We only sync the cache if the drive is still open */ cam_periph_lock(periph); if ((softc->flags & NDA_FLAG_OPEN) == 0) { cam_periph_unlock(periph); continue; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); nda_nvme_flush(softc, &ccb->nvmeio); error = cam_periph_runccb(ccb, ndaerror, /*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); xpt_release_ccb(ccb); cam_periph_unlock(periph); } } static void ndashutdown(void *arg, int howto) { ndaflush(); } static void ndasuspend(void *arg) { ndaflush(); } Index: head/sys/cam/scsi/scsi_da.c =================================================================== --- head/sys/cam/scsi/scsi_da.c (revision 329818) +++ head/sys/cam/scsi/scsi_da.c (revision 329819) @@ -1,6273 +1,6273 @@ /*- * Implementation of SCSI Direct Access Peripheral driver for CAM. * * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include "opt_da.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #ifndef _KERNEL #include #include #endif /* _KERNEL */ #include #include #include #include #include #include #include #include #ifdef _KERNEL /* * Note that there are probe ordering dependencies here. The order isn't * controlled by this enumeration, but by explicit state transitions in * dastart() and dadone(). Here are some of the dependencies: * * 1. RC should come first, before RC16, unless there is evidence that RC16 * is supported. * 2. BDC needs to come before any of the ATA probes, or the ZONE probe. * 3. The ATA probes should go in this order: * ATA -> LOGDIR -> IDDIR -> SUP -> ATA_ZONE */ typedef enum { DA_STATE_PROBE_WP, DA_STATE_PROBE_RC, DA_STATE_PROBE_RC16, DA_STATE_PROBE_LBP, DA_STATE_PROBE_BLK_LIMITS, DA_STATE_PROBE_BDC, DA_STATE_PROBE_ATA, DA_STATE_PROBE_ATA_LOGDIR, DA_STATE_PROBE_ATA_IDDIR, DA_STATE_PROBE_ATA_SUP, DA_STATE_PROBE_ATA_ZONE, DA_STATE_PROBE_ZONE, DA_STATE_NORMAL } da_state; typedef enum { DA_FLAG_PACK_INVALID = 0x000001, DA_FLAG_NEW_PACK = 0x000002, DA_FLAG_PACK_LOCKED = 0x000004, DA_FLAG_PACK_REMOVABLE = 0x000008, DA_FLAG_NEED_OTAG = 0x000020, DA_FLAG_WAS_OTAG = 0x000040, DA_FLAG_RETRY_UA = 0x000080, DA_FLAG_OPEN = 0x000100, DA_FLAG_SCTX_INIT = 0x000200, DA_FLAG_CAN_RC16 = 0x000400, DA_FLAG_PROBED = 0x000800, DA_FLAG_DIRTY = 0x001000, DA_FLAG_ANNOUNCED = 0x002000, DA_FLAG_CAN_ATA_DMA = 0x004000, DA_FLAG_CAN_ATA_LOG = 0x008000, DA_FLAG_CAN_ATA_IDLOG = 0x010000, DA_FLAG_CAN_ATA_SUPCAP = 0x020000, DA_FLAG_CAN_ATA_ZONE = 0x040000 } da_flags; typedef enum { DA_Q_NONE = 0x00, DA_Q_NO_SYNC_CACHE = 0x01, DA_Q_NO_6_BYTE = 0x02, DA_Q_NO_PREVENT = 0x04, DA_Q_4K = 0x08, DA_Q_NO_RC16 = 0x10, DA_Q_NO_UNMAP = 0x20, DA_Q_RETRY_BUSY = 0x40, DA_Q_SMR_DM = 0x80, DA_Q_STRICT_UNMAP = 0x100 } da_quirks; #define DA_Q_BIT_STRING \ "\020" \ "\001NO_SYNC_CACHE" \ "\002NO_6_BYTE" \ "\003NO_PREVENT" \ "\0044K" \ "\005NO_RC16" \ "\006NO_UNMAP" \ "\007RETRY_BUSY" \ "\010SMR_DM" \ "\011STRICT_UNMAP" typedef enum { DA_CCB_PROBE_RC = 0x01, DA_CCB_PROBE_RC16 = 0x02, DA_CCB_PROBE_LBP = 0x03, DA_CCB_PROBE_BLK_LIMITS = 0x04, DA_CCB_PROBE_BDC = 0x05, DA_CCB_PROBE_ATA = 0x06, DA_CCB_BUFFER_IO = 0x07, DA_CCB_DUMP = 0x0A, DA_CCB_DELETE = 0x0B, DA_CCB_TUR = 0x0C, DA_CCB_PROBE_ZONE = 0x0D, DA_CCB_PROBE_ATA_LOGDIR = 0x0E, DA_CCB_PROBE_ATA_IDDIR = 0x0F, DA_CCB_PROBE_ATA_SUP = 0x10, DA_CCB_PROBE_ATA_ZONE = 0x11, DA_CCB_PROBE_WP = 0x12, DA_CCB_TYPE_MASK = 0x1F, DA_CCB_RETRY_UA = 0x20 } da_ccb_state; /* * Order here is important for method choice * * We prefer ATA_TRIM as tests run against a Sandforce 2281 SSD attached to * LSI 2008 (mps) controller (FW: v12, Drv: v14) resulted 20% quicker deletes * using ATA_TRIM than the corresponding UNMAP results for a real world mysql * import taking 5mins. * */ typedef enum { DA_DELETE_NONE, DA_DELETE_DISABLE, DA_DELETE_ATA_TRIM, DA_DELETE_UNMAP, DA_DELETE_WS16, DA_DELETE_WS10, DA_DELETE_ZERO, DA_DELETE_MIN = DA_DELETE_ATA_TRIM, DA_DELETE_MAX = DA_DELETE_ZERO } da_delete_methods; /* * For SCSI, host managed drives show up as a separate device type. For * ATA, host managed drives also have a different device signature. * XXX KDM figure out the ATA host managed signature. */ typedef enum { DA_ZONE_NONE = 0x00, DA_ZONE_DRIVE_MANAGED = 0x01, DA_ZONE_HOST_AWARE = 0x02, DA_ZONE_HOST_MANAGED = 0x03 } da_zone_mode; /* * We distinguish between these interface cases in addition to the drive type: * o ATA drive behind a SCSI translation layer that knows about ZBC/ZAC * o ATA drive behind a SCSI translation layer that does not know about * ZBC/ZAC, and so needs to be managed via ATA passthrough. In this * case, we would need to share the ATA code with the ada(4) driver. * o SCSI drive. */ typedef enum { DA_ZONE_IF_SCSI, DA_ZONE_IF_ATA_PASS, DA_ZONE_IF_ATA_SAT, } da_zone_interface; typedef enum { DA_ZONE_FLAG_RZ_SUP = 0x0001, DA_ZONE_FLAG_OPEN_SUP = 0x0002, DA_ZONE_FLAG_CLOSE_SUP = 0x0004, DA_ZONE_FLAG_FINISH_SUP = 0x0008, DA_ZONE_FLAG_RWP_SUP = 0x0010, DA_ZONE_FLAG_SUP_MASK = (DA_ZONE_FLAG_RZ_SUP | DA_ZONE_FLAG_OPEN_SUP | DA_ZONE_FLAG_CLOSE_SUP | DA_ZONE_FLAG_FINISH_SUP | DA_ZONE_FLAG_RWP_SUP), DA_ZONE_FLAG_URSWRZ = 0x0020, DA_ZONE_FLAG_OPT_SEQ_SET = 0x0040, DA_ZONE_FLAG_OPT_NONSEQ_SET = 0x0080, DA_ZONE_FLAG_MAX_SEQ_SET = 0x0100, DA_ZONE_FLAG_SET_MASK = (DA_ZONE_FLAG_OPT_SEQ_SET | DA_ZONE_FLAG_OPT_NONSEQ_SET | DA_ZONE_FLAG_MAX_SEQ_SET) } da_zone_flags; static struct da_zone_desc { da_zone_flags value; const char *desc; } da_zone_desc_table[] = { {DA_ZONE_FLAG_RZ_SUP, "Report Zones" }, {DA_ZONE_FLAG_OPEN_SUP, "Open" }, {DA_ZONE_FLAG_CLOSE_SUP, "Close" }, {DA_ZONE_FLAG_FINISH_SUP, "Finish" }, {DA_ZONE_FLAG_RWP_SUP, "Reset Write Pointer" }, }; typedef void da_delete_func_t (struct cam_periph *periph, union ccb *ccb, struct bio *bp); static da_delete_func_t da_delete_trim; static da_delete_func_t da_delete_unmap; static da_delete_func_t da_delete_ws; static const void * da_delete_functions[] = { NULL, NULL, da_delete_trim, da_delete_unmap, da_delete_ws, da_delete_ws, da_delete_ws }; static const char *da_delete_method_names[] = { "NONE", "DISABLE", "ATA_TRIM", "UNMAP", "WS16", "WS10", "ZERO" }; static const char *da_delete_method_desc[] = { "NONE", "DISABLED", "ATA TRIM", "UNMAP", "WRITE SAME(16) with UNMAP", "WRITE SAME(10) with UNMAP", "ZERO" }; /* Offsets into our private area for storing information */ #define ccb_state ppriv_field0 #define ccb_bp ppriv_ptr1 struct disk_params { u_int8_t heads; u_int32_t cylinders; u_int8_t secs_per_track; u_int32_t secsize; /* Number of bytes/sector */ u_int64_t sectors; /* total number sectors */ u_int stripesize; u_int stripeoffset; }; #define UNMAP_RANGE_MAX 0xffffffff #define UNMAP_HEAD_SIZE 8 #define UNMAP_RANGE_SIZE 16 #define UNMAP_MAX_RANGES 2048 /* Protocol Max is 4095 */ #define UNMAP_BUF_SIZE ((UNMAP_MAX_RANGES * UNMAP_RANGE_SIZE) + \ UNMAP_HEAD_SIZE) #define WS10_MAX_BLKS 0xffff #define WS16_MAX_BLKS 0xffffffff #define ATA_TRIM_MAX_RANGES ((UNMAP_BUF_SIZE / \ (ATA_DSM_RANGE_SIZE * ATA_DSM_BLK_SIZE)) * ATA_DSM_BLK_SIZE) #define DA_WORK_TUR (1 << 16) typedef enum { DA_REF_OPEN = 1, DA_REF_OPEN_HOLD, DA_REF_CLOSE_HOLD, DA_REF_PROBE_HOLD, DA_REF_TUR, DA_REF_GEOM, DA_REF_SYSCTL, DA_REF_REPROBE, DA_REF_MAX /* KEEP LAST */ } da_ref_token; struct da_softc { struct cam_iosched_softc *cam_iosched; struct bio_queue_head delete_run_queue; LIST_HEAD(, ccb_hdr) pending_ccbs; int refcount; /* Active xpt_action() calls */ da_state state; da_flags flags; da_quirks quirks; int minimum_cmd_size; int error_inject; int trim_max_ranges; int delete_available; /* Delete methods possibly available */ da_zone_mode zone_mode; da_zone_interface zone_interface; da_zone_flags zone_flags; struct ata_gp_log_dir ata_logdir; int valid_logdir_len; struct ata_identify_log_pages ata_iddir; int valid_iddir_len; uint64_t optimal_seq_zones; uint64_t optimal_nonseq_zones; uint64_t max_seq_zones; u_int maxio; uint32_t unmap_max_ranges; uint32_t unmap_max_lba; /* Max LBAs in UNMAP req */ uint32_t unmap_gran; uint32_t unmap_gran_align; uint64_t ws_max_blks; da_delete_methods delete_method_pref; da_delete_methods delete_method; da_delete_func_t *delete_func; int unmappedio; int rotating; struct disk_params params; struct disk *disk; union ccb saved_ccb; struct task sysctl_task; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; struct callout sendordered_c; uint64_t wwpn; uint8_t unmap_buf[UNMAP_BUF_SIZE]; struct scsi_read_capacity_data_long rcaplong; struct callout mediapoll_c; int ref_flags[DA_REF_MAX]; #ifdef CAM_IO_STATS struct sysctl_ctx_list sysctl_stats_ctx; struct sysctl_oid *sysctl_stats_tree; u_int errors; u_int timeouts; u_int invalidations; #endif #define DA_ANNOUNCETMP_SZ 80 char announce_temp[DA_ANNOUNCETMP_SZ]; #define DA_ANNOUNCE_SZ 400 char announcebuf[DA_ANNOUNCE_SZ]; }; #define dadeleteflag(softc, delete_method, enable) \ if (enable) { \ softc->delete_available |= (1 << delete_method); \ } else { \ softc->delete_available &= ~(1 << delete_method); \ } struct da_quirk_entry { struct scsi_inquiry_pattern inq_pat; da_quirks quirks; }; static const char quantum[] = "QUANTUM"; static const char microp[] = "MICROP"; static struct da_quirk_entry da_quirk_table[] = { /* SPI, FC devices */ { /* * Fujitsu M2513A MO drives. * Tested devices: M2513A2 firmware versions 1200 & 1300. * (dip switch selects whether T_DIRECT or T_OPTICAL device) * Reported by: W.Scholten */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* See above. */ {T_OPTICAL, SIP_MEDIA_REMOVABLE, "FUJITSU", "M2513A", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This particular Fujitsu drive doesn't like the * synchronize cache command. * Reported by: Tom Jackson */ {T_DIRECT, SIP_MEDIA_FIXED, "FUJITSU", "M2954*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This drive doesn't like the synchronize cache command * either. Reported by: Matthew Jacob * in NetBSD PR kern/6027, August 24, 1998. */ {T_DIRECT, SIP_MEDIA_FIXED, microp, "2217*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * This drive doesn't like the synchronize cache command * either. Reported by: Hellmuth Michaelis (hm@kts.org) * (PR 8882). */ {T_DIRECT, SIP_MEDIA_FIXED, microp, "2112*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: Blaz Zupan */ {T_DIRECT, SIP_MEDIA_FIXED, "NEC", "D3847*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: Blaz Zupan */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "MAVERICK 540S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS525S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't like the synchronize cache command. * Reported by: walter@pelissero.de */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "LPS540S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Doesn't work correctly with 6 byte reads/writes. * Returns illegal request, and points to byte 9 of the * 6-byte CDB. * Reported by: Adam McDougall */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 4*", "*"}, /*quirks*/ DA_Q_NO_6_BYTE }, { /* See above. */ {T_DIRECT, SIP_MEDIA_FIXED, quantum, "VIKING 2*", "*"}, /*quirks*/ DA_Q_NO_6_BYTE }, { /* * Doesn't like the synchronize cache command. * Reported by: walter@pelissero.de */ {T_DIRECT, SIP_MEDIA_FIXED, "CONNER", "CP3500*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * The CISS RAID controllers do not support SYNC_CACHE */ {T_DIRECT, SIP_MEDIA_FIXED, "COMPAQ", "RAID*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * The STEC SSDs sometimes hang on UNMAP. */ {T_DIRECT, SIP_MEDIA_FIXED, "STEC", "*", "*"}, /*quirks*/ DA_Q_NO_UNMAP }, { /* * VMware returns BUSY status when storage has transient * connectivity problems, so better wait. * Also VMware returns odd errors on misaligned UNMAPs. */ {T_DIRECT, SIP_MEDIA_FIXED, "VMware*", "*", "*"}, /*quirks*/ DA_Q_RETRY_BUSY | DA_Q_STRICT_UNMAP }, /* USB mass storage devices supported by umass(4) */ { /* * EXATELECOM (Sigmatel) i-Bead 100/105 USB Flash MP3 Player * PR: kern/51675 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "EXATEL", "i-BEAD10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Power Quotient Int. (PQI) USB flash key * PR: kern/53067 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "USB Flash Disk*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Creative Nomad MUVO mp3 player (USB) * PR: kern/53094 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "NOMAD_MUVO", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * Jungsoft NEXDISK USB flash key * PR: kern/54737 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "JUNGSOFT", "NEXDISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * FreeDik USB Mini Data Drive * PR: kern/54786 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FreeDik*", "Mini Data Drive", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Sigmatel USB Flash MP3 Player * PR: kern/57046 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SigmaTel", "MSCN", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * Neuros USB Digital Audio Computer * PR: kern/63645 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "NEUROS", "dig. audio comp.", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SEAGRAND NP-900 MP3 Player * PR: kern/64563 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SEAGRAND", "NP-900*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * iRiver iFP MP3 player (with UMS Firmware) * PR: kern/54881, i386/63941, kern/66124 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iRiver", "iFP*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Frontier Labs NEX IA+ Digital Audio Player, rev 1.10/0.01 * PR: kern/70158 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "FL" , "Nex*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * ZICPlay USB MP3 Player with FM * PR: kern/75057 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "ACTIONS*" , "USB DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * TEAC USB floppy mechanisms */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "TEAC" , "FD-05*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Kingston DataTraveler II+ USB Pen-Drive. * Reported by: Pawel Jakub Dawidek */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston" , "DataTraveler II+", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * USB DISK Pro PMAP * Reported by: jhs * PR: usb/96381 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, " ", "USB DISK Pro", "PMAP"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Motorola E398 Mobile Phone (TransFlash memory card). * Reported by: Wojciech A. Koszek * PR: usb/89889 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Motorola" , "Motorola Phone", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Qware BeatZkey! Pro * PR: usb/79164 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "GENERIC", "USB DISK DEVICE", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Time DPA20B 1GB MP3 Player * PR: usb/81846 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB2.0*", "(FS) FLASH DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Samsung USB key 128Mb * PR: usb/90081 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB-DISK", "FreeDik-FlashUsb", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Kingston DataTraveler 2.0 USB Flash memory. * PR: usb/89196 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler 2.0", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Creative MUVO Slim mp3 player (USB) * PR: usb/86131 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CREATIVE", "MuVo Slim", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE|DA_Q_NO_PREVENT }, { /* * United MP5512 Portable MP3 Player (2-in-1 USB DISK/MP3) * PR: usb/80487 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "MUSIC DISK", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SanDisk Micro Cruzer 128MB * PR: usb/75970 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SanDisk" , "Micro Cruzer", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * TOSHIBA TransMemory USB sticks * PR: kern/94660 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "TOSHIBA", "TransMemory", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * PNY USB 3.0 Flash Drives */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "PNY", "USB 3.0 FD*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_RC16 }, { /* * PNY USB Flash keys * PR: usb/75578, usb/72344, usb/65436 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "*" , "USB DISK*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Genesys GL3224 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*", "120?"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_4K | DA_Q_NO_RC16 }, { /* * Genesys 6-in-1 Card Reader * PR: usb/94647 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Generic*", "STORAGE DEVICE*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Rekam Digital CAMERA * PR: usb/98713 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "CAMERA*", "4MP-9J6*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * iRiver H10 MP3 player * PR: usb/102547 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "H10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * iRiver U10 MP3 player * PR: usb/92306 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "iriver", "U10*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * X-Micro Flash Disk * PR: usb/96901 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "X-Micro", "Flash Disk", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * EasyMP3 EM732X USB 2.0 Flash MP3 Player * PR: usb/96546 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "EM732X", "MP3 Player*", "1.00"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Denver MP3 player * PR: usb/107101 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "DENVER", "MP3 PLAYER", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Philips USB Key Audio KEY013 * PR: usb/68412 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "PHILIPS", "Key*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT }, { /* * JNC MP3 Player * PR: usb/94439 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "JNC*" , "MP3 Player*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * SAMSUNG MP0402H * PR: usb/108427 */ {T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "MP0402H", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * I/O Magic USB flash - Giga Bank * PR: usb/108810 */ {T_DIRECT, SIP_MEDIA_FIXED, "GS-Magic", "stor*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * JoyFly 128mb USB Flash Drive * PR: 96133 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "Flash Disk*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * ChipsBnk usb stick * PR: 103702 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "ChipsBnk", "USB*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Storcase (Kingston) InfoStation IFS FC2/SATA-R 201A * PR: 129858 */ {T_DIRECT, SIP_MEDIA_FIXED, "IFS", "FC2/SATA-R*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Samsung YP-U3 mp3-player * PR: 125398 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Samsung", "YP-U3", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { {T_DIRECT, SIP_MEDIA_REMOVABLE, "Netac", "OnlyDisk*", "2000"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Sony Cyber-Shot DSC cameras * PR: usb/137035 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "Sony", "Sony DSC", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE | DA_Q_NO_PREVENT }, { {T_DIRECT, SIP_MEDIA_REMOVABLE, "Kingston", "DataTraveler G3", "1.00"}, /*quirks*/ DA_Q_NO_PREVENT }, { /* At least several Transcent USB sticks lie on RC16. */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "JetFlash", "Transcend*", "*"}, /*quirks*/ DA_Q_NO_RC16 }, { /* * I-O Data USB Flash Disk * PR: usb/211716 */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "I-O DATA", "USB Flash Disk*", "*"}, /*quirks*/ DA_Q_NO_RC16 }, /* ATA/SATA devices over SAS/USB/... */ { /* Hitachi Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "Hitachi", "H??????????E3*", "*" }, /*quirks*/DA_Q_4K }, { /* Micron Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Micron 5100 MTFDDAK*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD155UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD155UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG HD204UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Samsung Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HD204UI*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DL*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST????DL", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???DM*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST???DM*", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST????DM*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Barracuda Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST????DM", "*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9500424AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST950042", "4AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9640424AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST964042", "4AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750420AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "0AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750422AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "2AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST9750423AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST975042", "3AS*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Thin Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST???LT*", "*" }, /*quirks*/DA_Q_4K }, { /* Seagate Momentus Thin Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ST???LT*", "*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "??RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RS*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD??????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Caviar Green Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "????RX*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Black Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PKT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD???PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "?PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "WDC WD?????PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* WDC Scorpio Blue Advanced Format (4k) drives */ { T_DIRECT, SIP_MEDIA_FIXED, "WDC WD??", "???PVT*", "*" }, /*quirks*/DA_Q_4K }, { /* * Olympus FE-210 camera */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "OLYMPUS", "FE210*", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * LG UP3S MP3 player */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "LG", "UP3S", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * Laser MP3-2GA13 MP3 player */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "USB 2.0", "(HS) Flash Disk", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, { /* * LaCie external 250GB Hard drive des by Porsche * Submitted by: Ben Stuyts * PR: 121474 */ {T_DIRECT, SIP_MEDIA_FIXED, "SAMSUNG", "HM250JI", "*"}, /*quirks*/ DA_Q_NO_SYNC_CACHE }, /* SATA SSDs */ { /* * Corsair Force 2 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair CSSD-F*", "*" }, /*quirks*/DA_Q_4K }, { /* * Corsair Force 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force 3*", "*" }, /*quirks*/DA_Q_4K }, { /* * Corsair Neutron GTX SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Corsair Neutron GTX*", "*" }, /*quirks*/DA_Q_4K }, { /* * Corsair Force GT & GS SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Corsair Force G*", "*" }, /*quirks*/DA_Q_4K }, { /* * Crucial M4 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "M4-CT???M4SSD2*", "*" }, /*quirks*/DA_Q_4K }, { /* * Crucial RealSSD C300 SSDs * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "C300-CTFDDAC???MAG*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel 320 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2CW*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel 330 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2CT*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel 510 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2MH*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel 520 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BW*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel S3610 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSC2BX*", "*" }, /*quirks*/DA_Q_4K }, { /* * Intel X25-M Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "INTEL SSDSA2M*", "*" }, /*quirks*/DA_Q_4K }, { /* * Kingston E100 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SE100S3*", "*" }, /*quirks*/DA_Q_4K }, { /* * Kingston HyperX 3k SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "KINGSTON SH103S3*", "*" }, /*quirks*/DA_Q_4K }, { /* * Marvell SSDs (entry taken from OpenSolaris) * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MARVELL SD88SA02*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Agility 2 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "*", "OCZ-AGILITY2*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Agility 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-AGILITY3*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Deneva R Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "DENRSTE251M45*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Vertex 2 SSDs (inc pro series) * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ?VERTEX2*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Vertex 3 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX3*", "*" }, /*quirks*/DA_Q_4K }, { /* * OCZ Vertex 4 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "OCZ-VERTEX4*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 750 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 750*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 830 Series SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG SSD 830 Series*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 840 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 840*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 845 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 845*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 850 SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "Samsung SSD 850*", "*" }, /*quirks*/DA_Q_4K }, { /* * Samsung 843T Series SSDs (MZ7WD*) * Samsung PM851 Series SSDs (MZ7TE*) * Samsung PM853T Series SSDs (MZ7GE*) * Samsung SM863 Series SSDs (MZ7KM*) * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SAMSUNG MZ7*", "*" }, /*quirks*/DA_Q_4K }, { /* * Same as for SAMSUNG MZ7* but enable the quirks for SSD * starting with MZ7* too */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "MZ7*", "*" }, /*quirks*/DA_Q_4K }, { /* * SuperTalent TeraDrive CT SSDs * 4k optimised & trim only works in 4k requests + 4k aligned */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "FTM??CT25H*", "*" }, /*quirks*/DA_Q_4K }, { /* * XceedIOPS SATA SSDs * 4k optimised */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "SG9XCS2D*", "*" }, /*quirks*/DA_Q_4K }, { /* * Hama Innostor USB-Stick */ { T_DIRECT, SIP_MEDIA_REMOVABLE, "Innostor", "Innostor*", "*" }, /*quirks*/DA_Q_NO_RC16 }, { /* * Seagate Lamarr 8TB Shingled Magnetic Recording (SMR) * Drive Managed SATA hard drive. This drive doesn't report * in firmware that it is a drive managed SMR drive. */ { T_DIRECT, SIP_MEDIA_FIXED, "ATA", "ST8000AS000[23]*", "*" }, /*quirks*/DA_Q_SMR_DM }, { /* * MX-ES USB Drive by Mach Xtreme */ { T_DIRECT, SIP_MEDIA_REMOVABLE, "MX", "MXUB3*", "*"}, /*quirks*/DA_Q_NO_RC16 }, }; static disk_strategy_t dastrategy; static dumper_t dadump; static periph_init_t dainit; static void daasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static void dasysctlinit(void *context, int pending); static int dasysctlsofttimeout(SYSCTL_HANDLER_ARGS); static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS); static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS); static int dazonemodesysctl(SYSCTL_HANDLER_ARGS); static int dazonesupsysctl(SYSCTL_HANDLER_ARGS); static int dadeletemaxsysctl(SYSCTL_HANDLER_ARGS); static void dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method); static off_t dadeletemaxsize(struct da_softc *softc, da_delete_methods delete_method); static void dadeletemethodchoose(struct da_softc *softc, da_delete_methods default_method); static void daprobedone(struct cam_periph *periph, union ccb *ccb); static periph_ctor_t daregister; static periph_dtor_t dacleanup; static periph_start_t dastart; static periph_oninv_t daoninvalidate; static void dazonedone(struct cam_periph *periph, union ccb *ccb); static void dadone(struct cam_periph *periph, union ccb *done_ccb); static int daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static void daprevent(struct cam_periph *periph, int action); static void dareprobe(struct cam_periph *periph); static void dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector, struct scsi_read_capacity_data_long *rcaplong, size_t rcap_size); static timeout_t dasendorderedtag; static void dashutdown(void *arg, int howto); static timeout_t damediapoll; #ifndef DA_DEFAULT_POLL_PERIOD #define DA_DEFAULT_POLL_PERIOD 3 #endif #ifndef DA_DEFAULT_TIMEOUT #define DA_DEFAULT_TIMEOUT 60 /* Timeout in seconds */ #endif #ifndef DA_DEFAULT_SOFTTIMEOUT #define DA_DEFAULT_SOFTTIMEOUT 0 #endif #ifndef DA_DEFAULT_RETRY #define DA_DEFAULT_RETRY 4 #endif #ifndef DA_DEFAULT_SEND_ORDERED #define DA_DEFAULT_SEND_ORDERED 1 #endif static int da_poll_period = DA_DEFAULT_POLL_PERIOD; static int da_retry_count = DA_DEFAULT_RETRY; static int da_default_timeout = DA_DEFAULT_TIMEOUT; static sbintime_t da_default_softtimeout = DA_DEFAULT_SOFTTIMEOUT; static int da_send_ordered = DA_DEFAULT_SEND_ORDERED; static SYSCTL_NODE(_kern_cam, OID_AUTO, da, CTLFLAG_RD, 0, "CAM Direct Access Disk driver"); SYSCTL_INT(_kern_cam_da, OID_AUTO, poll_period, CTLFLAG_RWTUN, &da_poll_period, 0, "Media polling period in seconds"); SYSCTL_INT(_kern_cam_da, OID_AUTO, retry_count, CTLFLAG_RWTUN, &da_retry_count, 0, "Normal I/O retry count"); SYSCTL_INT(_kern_cam_da, OID_AUTO, default_timeout, CTLFLAG_RWTUN, &da_default_timeout, 0, "Normal I/O timeout (in seconds)"); SYSCTL_INT(_kern_cam_da, OID_AUTO, send_ordered, CTLFLAG_RWTUN, &da_send_ordered, 0, "Send Ordered Tags"); SYSCTL_PROC(_kern_cam_da, OID_AUTO, default_softtimeout, CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, dasysctlsofttimeout, "I", "Soft I/O timeout (ms)"); TUNABLE_INT64("kern.cam.da.default_softtimeout", &da_default_softtimeout); /* * DA_ORDEREDTAG_INTERVAL determines how often, relative * to the default timeout, we check to see whether an ordered * tagged transaction is appropriate to prevent simple tag * starvation. Since we'd like to ensure that there is at least * 1/2 of the timeout length left for a starved transaction to * complete after we've sent an ordered tag, we must poll at least * four times in every timeout period. This takes care of the worst * case where a starved transaction starts during an interval that * meets the requirement "don't send an ordered tag" test so it takes * us two intervals to determine that a tag must be sent. */ #ifndef DA_ORDEREDTAG_INTERVAL #define DA_ORDEREDTAG_INTERVAL 4 #endif static struct periph_driver dadriver = { dainit, "da", TAILQ_HEAD_INITIALIZER(dadriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(da, dadriver); static MALLOC_DEFINE(M_SCSIDA, "scsi_da", "scsi_da buffers"); /* * This driver takes out references / holds in well defined pairs, never * recursively. These macros / inline functions enforce those rules. They * are only enabled with DA_TRACK_REFS or INVARIANTS. If DA_TRACK_REFS is * defined to be 2 or larger, the tracking also includes debug printfs. */ #if defined(DA_TRACK_REFS) || defined(INVARIANTS) #ifndef DA_TRACK_REFS #define DA_TRACK_REFS 1 #endif #if DA_TRACK_REFS > 1 static const char *da_ref_text[] = { "bogus", "open", "open hold", "close hold", "reprobe hold", "Test Unit Ready", "Geom", "sysctl", "reprobe", "max -- also bogus" }; #define DA_PERIPH_PRINT(periph, msg, args...) \ CAM_PERIPH_PRINT(periph, msg, ##args) #else #define DA_PERIPH_PRINT(periph, msg, args...) #endif static inline void token_sanity(da_ref_token token) { if ((unsigned)token >= DA_REF_MAX) panic("Bad token value passed in %d\n", token); } static inline int da_periph_hold(struct cam_periph *periph, int priority, da_ref_token token) { int err = cam_periph_hold(periph, priority); token_sanity(token); DA_PERIPH_PRINT(periph, "Holding device %s (%d): %d\n", da_ref_text[token], token, err); if (err == 0) { int cnt; struct da_softc *softc = periph->softc; cnt = atomic_fetchadd_int(&softc->ref_flags[token], 1); if (cnt != 0) panic("Re-holding for reason %d, cnt = %d", token, cnt); } return (err); } static inline void da_periph_unhold(struct cam_periph *periph, da_ref_token token) { int cnt; struct da_softc *softc = periph->softc; token_sanity(token); DA_PERIPH_PRINT(periph, "Unholding device %s (%d)\n", da_ref_text[token], token); cnt = atomic_fetchadd_int(&softc->ref_flags[token], -1); if (cnt != 1) panic("Unholding %d with cnt = %d", token, cnt); cam_periph_unhold(periph); } static inline int da_periph_acquire(struct cam_periph *periph, da_ref_token token) { int err = cam_periph_acquire(periph); token_sanity(token); DA_PERIPH_PRINT(periph, "acquiring device %s (%d): %d\n", da_ref_text[token], token, err); if (err == 0) { int cnt; struct da_softc *softc = periph->softc; cnt = atomic_fetchadd_int(&softc->ref_flags[token], 1); if (cnt != 0) panic("Re-refing for reason %d, cnt = %d", token, cnt); } return (err); } static inline void da_periph_release(struct cam_periph *periph, da_ref_token token) { int cnt; struct da_softc *softc = periph->softc; token_sanity(token); DA_PERIPH_PRINT(periph, "releasing device %s (%d)\n", da_ref_text[token], token); cnt = atomic_fetchadd_int(&softc->ref_flags[token], -1); if (cnt != 1) panic("Releasing %d with cnt = %d", token, cnt); cam_periph_release(periph); } static inline void da_periph_release_locked(struct cam_periph *periph, da_ref_token token) { int cnt; struct da_softc *softc = periph->softc; token_sanity(token); DA_PERIPH_PRINT(periph, "releasing device (locked) %s (%d)\n", da_ref_text[token], token); cnt = atomic_fetchadd_int(&softc->ref_flags[token], -1); if (cnt != 1) panic("Unholding %d with cnt = %d", token, cnt); cam_periph_release_locked(periph); } #define cam_periph_hold POISON #define cam_periph_unhold POISON #define cam_periph_acquire POISON #define cam_periph_release POISON #define cam_periph_release_locked POISON #else #define da_periph_hold(periph, prio, token) cam_periph_hold((periph), (prio)) #define da_periph_unhold(periph, token) cam_periph_unhold((periph)) #define da_periph_acquire(periph, token) cam_periph_acquire((periph)) #define da_periph_release(periph, token) cam_periph_release((periph)) #define da_periph_release_locked(periph, token) cam_periph_release_locked((periph)) #endif static int daopen(struct disk *dp) { struct cam_periph *periph; struct da_softc *softc; int error; periph = (struct cam_periph *)dp->d_drv1; if (da_periph_acquire(periph, DA_REF_OPEN) != 0) { return (ENXIO); } cam_periph_lock(periph); if ((error = da_periph_hold(periph, PRIBIO|PCATCH, DA_REF_OPEN_HOLD)) != 0) { cam_periph_unlock(periph); da_periph_release(periph, DA_REF_OPEN); return (error); } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("daopen\n")); softc = (struct da_softc *)periph->softc; dareprobe(periph); /* Wait for the disk size update. */ error = cam_periph_sleep(periph, &softc->disk->d_mediasize, PRIBIO, "dareprobe", 0); if (error != 0) xpt_print(periph->path, "unable to retrieve capacity data\n"); if (periph->flags & CAM_PERIPH_INVALID) error = ENXIO; if (error == 0 && (softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 && (softc->quirks & DA_Q_NO_PREVENT) == 0) daprevent(periph, PR_PREVENT); if (error == 0) { softc->flags &= ~DA_FLAG_PACK_INVALID; softc->flags |= DA_FLAG_OPEN; } da_periph_unhold(periph, DA_REF_OPEN_HOLD); cam_periph_unlock(periph); if (error != 0) da_periph_release(periph, DA_REF_OPEN); return (error); } static int daclose(struct disk *dp) { struct cam_periph *periph; struct da_softc *softc; union ccb *ccb; periph = (struct cam_periph *)dp->d_drv1; softc = (struct da_softc *)periph->softc; cam_periph_lock(periph); CAM_DEBUG(periph->path, CAM_DEBUG_TRACE | CAM_DEBUG_PERIPH, ("daclose\n")); if (da_periph_hold(periph, PRIBIO, DA_REF_CLOSE_HOLD) == 0) { /* Flush disk cache. */ if ((softc->flags & DA_FLAG_DIRTY) != 0 && (softc->quirks & DA_Q_NO_SYNC_CACHE) == 0 && (softc->flags & DA_FLAG_PACK_INVALID) == 0) { ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_synchronize_cache(&ccb->csio, /*retries*/1, /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE, 5 * 60 * 1000); cam_periph_runccb(ccb, daerror, /*cam_flags*/0, /*sense_flags*/SF_RETRY_UA | SF_QUIET_IR, softc->disk->d_devstat); softc->flags &= ~DA_FLAG_DIRTY; xpt_release_ccb(ccb); } /* Allow medium removal. */ if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0 && (softc->quirks & DA_Q_NO_PREVENT) == 0) daprevent(periph, PR_ALLOW); da_periph_unhold(periph, DA_REF_CLOSE_HOLD); } /* * If we've got removeable media, mark the blocksize as * unavailable, since it could change when new media is * inserted. */ if ((softc->flags & DA_FLAG_PACK_REMOVABLE) != 0) softc->disk->d_devstat->flags |= DEVSTAT_BS_UNAVAILABLE; softc->flags &= ~DA_FLAG_OPEN; while (softc->refcount != 0) cam_periph_sleep(periph, &softc->refcount, PRIBIO, "daclose", 1); cam_periph_unlock(periph); da_periph_release(periph, DA_REF_OPEN); return (0); } static void daschedule(struct cam_periph *periph) { struct da_softc *softc = (struct da_softc *)periph->softc; if (softc->state != DA_STATE_NORMAL) return; cam_iosched_schedule(softc->cam_iosched, periph); } /* * Actually translate the requested transfer into one the physical driver * can understand. The transfer is described by a buf and will include * only one physical transfer. */ static void dastrategy(struct bio *bp) { struct cam_periph *periph; struct da_softc *softc; periph = (struct cam_periph *)bp->bio_disk->d_drv1; softc = (struct da_softc *)periph->softc; cam_periph_lock(periph); /* * If the device has been made invalid, error out */ if ((softc->flags & DA_FLAG_PACK_INVALID)) { cam_periph_unlock(periph); biofinish(bp, NULL, ENXIO); return; } CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastrategy(%p)\n", bp)); /* * Zone commands must be ordered, because they can depend on the * effects of previously issued commands, and they may affect * commands after them. */ if (bp->bio_cmd == BIO_ZONE) bp->bio_flags |= BIO_ORDERED; /* * Place it in the queue of disk activities for this disk */ cam_iosched_queue_work(softc->cam_iosched, bp); /* * Schedule ourselves for performing the work. */ daschedule(periph); cam_periph_unlock(periph); return; } static int dadump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct cam_periph *periph; struct da_softc *softc; u_int secsize; struct ccb_scsiio csio; struct disk *dp; int error = 0; dp = arg; periph = dp->d_drv1; softc = (struct da_softc *)periph->softc; secsize = softc->params.secsize; if ((softc->flags & DA_FLAG_PACK_INVALID) != 0) return (ENXIO); memset(&csio, 0, sizeof(csio)); if (length > 0) { xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); csio.ccb_h.ccb_state = DA_CCB_DUMP; scsi_read_write(&csio, /*retries*/0, dadone, MSG_ORDERED_Q_TAG, /*read*/SCSI_RW_WRITE, /*byte2*/0, /*minimum_cmd_size*/ softc->minimum_cmd_size, offset / secsize, length / secsize, /*data_ptr*/(u_int8_t *) virtual, /*dxfer_len*/length, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); error = cam_periph_runccb((union ccb *)&csio, cam_periph_error, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if (error != 0) printf("Aborting dump due to I/O error.\n"); return (error); } /* * Sync the disk cache contents to the physical media. */ if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) { xpt_setup_ccb(&csio.ccb_h, periph->path, CAM_PRIORITY_NORMAL); csio.ccb_h.ccb_state = DA_CCB_DUMP; scsi_synchronize_cache(&csio, /*retries*/0, /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0,/* Cover the whole disk */ /*lb_count*/0, SSD_FULL_SIZE, 5 * 1000); error = cam_periph_runccb((union ccb *)&csio, cam_periph_error, 0, SF_NO_RECOVERY | SF_NO_RETRY, NULL); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); } return (error); } static int dagetattr(struct bio *bp) { int ret; struct cam_periph *periph; periph = (struct cam_periph *)bp->bio_disk->d_drv1; cam_periph_lock(periph); ret = xpt_getattr(bp->bio_data, bp->bio_length, bp->bio_attribute, periph->path); cam_periph_unlock(periph); if (ret == 0) bp->bio_completed = bp->bio_length; return ret; } static void dainit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, daasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("da: Failed to attach master async callback " "due to status 0x%x!\n", status); } else if (da_send_ordered) { /* Register our shutdown event handler */ if ((EVENTHANDLER_REGISTER(shutdown_post_sync, dashutdown, NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) printf("dainit: shutdown event registration failed!\n"); } } /* * Callback from GEOM, called when it has finished cleaning up its * resources. */ static void dadiskgonecb(struct disk *dp) { struct cam_periph *periph; periph = (struct cam_periph *)dp->d_drv1; da_periph_release(periph, DA_REF_GEOM); } static void daoninvalidate(struct cam_periph *periph) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; /* * De-register any async callbacks. */ xpt_register_async(0, daasync, periph, periph->path); softc->flags |= DA_FLAG_PACK_INVALID; #ifdef CAM_IO_STATS softc->invalidations++; #endif /* * Return all queued I/O with ENXIO. * XXX Handle any transactions queued to the card * with XPT_ABORT_CCB. */ cam_iosched_flush(softc->cam_iosched, NULL, ENXIO); /* * Tell GEOM that we've gone away, we'll get a callback when it is * done cleaning up its resources. */ disk_gone(softc->disk); } static void dacleanup(struct cam_periph *periph) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; cam_periph_unlock(periph); cam_iosched_fini(softc->cam_iosched); /* * If we can't free the sysctl tree, oh well... */ if ((softc->flags & DA_FLAG_SCTX_INIT) != 0) { #ifdef CAM_IO_STATS if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0) xpt_print(periph->path, "can't remove sysctl stats context\n"); #endif if (sysctl_ctx_free(&softc->sysctl_ctx) != 0) xpt_print(periph->path, "can't remove sysctl context\n"); } callout_drain(&softc->mediapoll_c); disk_destroy(softc->disk); callout_drain(&softc->sendordered_c); free(softc, M_DEVBUF); cam_periph_lock(periph); } static void daasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; struct da_softc *softc; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; if (cgd->protocol != PROTO_SCSI) break; if (SID_QUAL(&cgd->inq_data) != SID_QUAL_LU_CONNECTED) break; if (SID_TYPE(&cgd->inq_data) != T_DIRECT && SID_TYPE(&cgd->inq_data) != T_RBC && SID_TYPE(&cgd->inq_data) != T_OPTICAL && SID_TYPE(&cgd->inq_data) != T_ZBC_HM) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(daregister, daoninvalidate, dacleanup, dastart, "da", CAM_PERIPH_BIO, path, daasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) printf("daasync: Unable to attach to new device " "due to status 0x%x\n", status); return; } case AC_ADVINFO_CHANGED: { uintptr_t buftype; buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct da_softc *softc; softc = periph->softc; disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); } break; } case AC_UNIT_ATTENTION: { union ccb *ccb; int error_code, sense_key, asc, ascq; softc = (struct da_softc *)periph->softc; ccb = (union ccb *)arg; /* * Handle all UNIT ATTENTIONs except our own, * as they will be handled by daerror(). */ if (xpt_path_periph(ccb->ccb_h.path) != periph && scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq)) { if (asc == 0x2A && ascq == 0x09) { xpt_print(ccb->ccb_h.path, "Capacity data has changed\n"); softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); } else if (asc == 0x28 && ascq == 0x00) { softc->flags &= ~DA_FLAG_PROBED; disk_media_changed(softc->disk, M_NOWAIT); } else if (asc == 0x3F && ascq == 0x03) { xpt_print(ccb->ccb_h.path, "INQUIRY data has changed\n"); softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); } } break; } case AC_SCSI_AEN: softc = (struct da_softc *)periph->softc; if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) { if (da_periph_acquire(periph, DA_REF_TUR) == 0) { cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR); daschedule(periph); } } /* FALLTHROUGH */ case AC_SENT_BDR: case AC_BUS_RESET: { struct ccb_hdr *ccbh; softc = (struct da_softc *)periph->softc; /* * Don't fail on the expected unit attention * that will occur. */ softc->flags |= DA_FLAG_RETRY_UA; LIST_FOREACH(ccbh, &softc->pending_ccbs, periph_links.le) ccbh->ccb_state |= DA_CCB_RETRY_UA; break; } case AC_INQ_CHANGED: softc = (struct da_softc *)periph->softc; softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); break; default: break; } cam_periph_async(periph, code, path, arg); } static void dasysctlinit(void *context, int pending) { struct cam_periph *periph; struct da_softc *softc; char tmpstr[32], tmpstr2[16]; struct ccb_trans_settings cts; periph = (struct cam_periph *)context; /* * periph was held for us when this task was enqueued */ if (periph->flags & CAM_PERIPH_INVALID) { da_periph_release(periph, DA_REF_SYSCTL); return; } softc = (struct da_softc *)periph->softc; snprintf(tmpstr, sizeof(tmpstr), "CAM DA unit %d", periph->unit_number); snprintf(tmpstr2, sizeof(tmpstr2), "%d", periph->unit_number); sysctl_ctx_init(&softc->sysctl_ctx); softc->flags |= DA_FLAG_SCTX_INIT; softc->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam_da), OID_AUTO, tmpstr2, CTLFLAG_RD, 0, tmpstr, "device_index"); if (softc->sysctl_tree == NULL) { printf("dasysctlinit: unable to allocate sysctl tree\n"); da_periph_release(periph, DA_REF_SYSCTL); return; } /* * Now register the sysctl handler, so the user can change the value on * the fly. */ SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "delete_method", CTLTYPE_STRING | CTLFLAG_RWTUN, softc, 0, dadeletemethodsysctl, "A", "BIO_DELETE execution method"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "delete_max", CTLTYPE_U64 | CTLFLAG_RW, softc, 0, dadeletemaxsysctl, "Q", "Maximum BIO_DELETE size"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "minimum_cmd_size", CTLTYPE_INT | CTLFLAG_RW, &softc->minimum_cmd_size, 0, dacmdsizesysctl, "I", "Minimum CDB size"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "zone_mode", CTLTYPE_STRING | CTLFLAG_RD, softc, 0, dazonemodesysctl, "A", "Zone Mode"); SYSCTL_ADD_PROC(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "zone_support", CTLTYPE_STRING | CTLFLAG_RD, softc, 0, dazonesupsysctl, "A", "Zone Support"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "optimal_seq_zones", CTLFLAG_RD, &softc->optimal_seq_zones, "Optimal Number of Open Sequential Write Preferred Zones"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "optimal_nonseq_zones", CTLFLAG_RD, &softc->optimal_nonseq_zones, "Optimal Number of Non-Sequentially Written Sequential Write " "Preferred Zones"); SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "max_seq_zones", CTLFLAG_RD, &softc->max_seq_zones, "Maximum Number of Open Sequential Write Required Zones"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "error_inject", CTLFLAG_RW, &softc->error_inject, 0, "error_inject leaf"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "unmapped_io", CTLFLAG_RD, &softc->unmappedio, 0, "Unmapped I/O leaf"); SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "rotating", CTLFLAG_RD, &softc->rotating, 0, "Rotating media"); /* * Add some addressing info. */ memset(&cts, 0, sizeof (cts)); xpt_setup_ccb(&cts.ccb_h, periph->path, CAM_PRIORITY_NONE); cts.ccb_h.func_code = XPT_GET_TRAN_SETTINGS; cts.type = CTS_TYPE_CURRENT_SETTINGS; cam_periph_lock(periph); xpt_action((union ccb *)&cts); cam_periph_unlock(periph); if (cts.ccb_h.status != CAM_REQ_CMP) { da_periph_release(periph, DA_REF_SYSCTL); return; } if (cts.protocol == PROTO_SCSI && cts.transport == XPORT_FC) { struct ccb_trans_settings_fc *fc = &cts.xport_specific.fc; if (fc->valid & CTS_FC_VALID_WWPN) { softc->wwpn = fc->wwpn; SYSCTL_ADD_UQUAD(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "wwpn", CTLFLAG_RD, &softc->wwpn, "World Wide Port Name"); } } #ifdef CAM_IO_STATS /* * Now add some useful stats. * XXX These should live in cam_periph and be common to all periphs */ softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats", CTLFLAG_RD, 0, "Statistics"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "errors", CTLFLAG_RD, &softc->errors, 0, "Transport errors reported by the SIM"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "timeouts", CTLFLAG_RD, &softc->timeouts, 0, "Device timeouts reported by the SIM"); SYSCTL_ADD_INT(&softc->sysctl_stats_ctx, SYSCTL_CHILDREN(softc->sysctl_stats_tree), OID_AUTO, "pack_invalidations", CTLFLAG_RD, &softc->invalidations, 0, "Device pack invalidations"); #endif cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx, softc->sysctl_tree); da_periph_release(periph, DA_REF_SYSCTL); } static int dadeletemaxsysctl(SYSCTL_HANDLER_ARGS) { int error; uint64_t value; struct da_softc *softc; softc = (struct da_softc *)arg1; value = softc->disk->d_delmaxsize; error = sysctl_handle_64(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); /* only accept values smaller than the calculated value */ if (value > dadeletemaxsize(softc, softc->delete_method)) { return (EINVAL); } softc->disk->d_delmaxsize = value; return (0); } static int dacmdsizesysctl(SYSCTL_HANDLER_ARGS) { int error, value; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); /* * Acceptable values here are 6, 10, 12 or 16. */ if (value < 6) value = 6; else if ((value > 6) && (value <= 10)) value = 10; else if ((value > 10) && (value <= 12)) value = 12; else if (value > 12) value = 16; *(int *)arg1 = value; return (0); } static int dasysctlsofttimeout(SYSCTL_HANDLER_ARGS) { sbintime_t value; int error; value = da_default_softtimeout / SBT_1MS; error = sysctl_handle_int(oidp, (int *)&value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); /* XXX Should clip this to a reasonable level */ if (value > da_default_timeout * 1000) return (EINVAL); da_default_softtimeout = value * SBT_1MS; return (0); } static void dadeletemethodset(struct da_softc *softc, da_delete_methods delete_method) { softc->delete_method = delete_method; softc->disk->d_delmaxsize = dadeletemaxsize(softc, delete_method); softc->delete_func = da_delete_functions[delete_method]; if (softc->delete_method > DA_DELETE_DISABLE) softc->disk->d_flags |= DISKFLAG_CANDELETE; else softc->disk->d_flags &= ~DISKFLAG_CANDELETE; } static off_t dadeletemaxsize(struct da_softc *softc, da_delete_methods delete_method) { off_t sectors; switch(delete_method) { case DA_DELETE_UNMAP: sectors = (off_t)softc->unmap_max_lba; break; case DA_DELETE_ATA_TRIM: sectors = (off_t)ATA_DSM_RANGE_MAX * softc->trim_max_ranges; break; case DA_DELETE_WS16: sectors = omin(softc->ws_max_blks, WS16_MAX_BLKS); break; case DA_DELETE_ZERO: case DA_DELETE_WS10: sectors = omin(softc->ws_max_blks, WS10_MAX_BLKS); break; default: return 0; } return (off_t)softc->params.secsize * omin(sectors, softc->params.sectors); } static void daprobedone(struct cam_periph *periph, union ccb *ccb) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; dadeletemethodchoose(softc, DA_DELETE_NONE); if (bootverbose && (softc->flags & DA_FLAG_ANNOUNCED) == 0) { char buf[80]; int i, sep; snprintf(buf, sizeof(buf), "Delete methods: <"); sep = 0; for (i = 0; i <= DA_DELETE_MAX; i++) { if ((softc->delete_available & (1 << i)) == 0 && i != softc->delete_method) continue; if (sep) strlcat(buf, ",", sizeof(buf)); strlcat(buf, da_delete_method_names[i], sizeof(buf)); if (i == softc->delete_method) strlcat(buf, "(*)", sizeof(buf)); sep = 1; } strlcat(buf, ">", sizeof(buf)); printf("%s%d: %s\n", periph->periph_name, periph->unit_number, buf); } /* * Since our peripheral may be invalidated by an error * above or an external event, we must release our CCB * before releasing the probe lock on the peripheral. * The peripheral will only go away once the last lock * is removed, and we need it around for the CCB release * operation. */ xpt_release_ccb(ccb); softc->state = DA_STATE_NORMAL; softc->flags |= DA_FLAG_PROBED; daschedule(periph); wakeup(&softc->disk->d_mediasize); if ((softc->flags & DA_FLAG_ANNOUNCED) == 0) { softc->flags |= DA_FLAG_ANNOUNCED; da_periph_unhold(periph, DA_REF_PROBE_HOLD); } else da_periph_release_locked(periph, DA_REF_REPROBE); } static void dadeletemethodchoose(struct da_softc *softc, da_delete_methods default_method) { int i, methods; /* If available, prefer the method requested by user. */ i = softc->delete_method_pref; methods = softc->delete_available | (1 << DA_DELETE_DISABLE); if (methods & (1 << i)) { dadeletemethodset(softc, i); return; } /* Use the pre-defined order to choose the best performing delete. */ for (i = DA_DELETE_MIN; i <= DA_DELETE_MAX; i++) { if (i == DA_DELETE_ZERO) continue; if (softc->delete_available & (1 << i)) { dadeletemethodset(softc, i); return; } } /* Fallback to default. */ dadeletemethodset(softc, default_method); } static int dadeletemethodsysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; const char *p; struct da_softc *softc; int i, error, value; softc = (struct da_softc *)arg1; value = softc->delete_method; if (value < 0 || value > DA_DELETE_MAX) p = "UNKNOWN"; else p = da_delete_method_names[value]; strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; i <= DA_DELETE_MAX; i++) { if (strcmp(buf, da_delete_method_names[i]) == 0) break; } if (i > DA_DELETE_MAX) return (EINVAL); softc->delete_method_pref = i; dadeletemethodchoose(softc, DA_DELETE_NONE); return (0); } static int dazonemodesysctl(SYSCTL_HANDLER_ARGS) { char tmpbuf[40]; struct da_softc *softc; int error; softc = (struct da_softc *)arg1; switch (softc->zone_mode) { case DA_ZONE_DRIVE_MANAGED: snprintf(tmpbuf, sizeof(tmpbuf), "Drive Managed"); break; case DA_ZONE_HOST_AWARE: snprintf(tmpbuf, sizeof(tmpbuf), "Host Aware"); break; case DA_ZONE_HOST_MANAGED: snprintf(tmpbuf, sizeof(tmpbuf), "Host Managed"); break; case DA_ZONE_NONE: default: snprintf(tmpbuf, sizeof(tmpbuf), "Not Zoned"); break; } error = sysctl_handle_string(oidp, tmpbuf, sizeof(tmpbuf), req); return (error); } static int dazonesupsysctl(SYSCTL_HANDLER_ARGS) { char tmpbuf[180]; struct da_softc *softc; struct sbuf sb; int error, first; unsigned int i; softc = (struct da_softc *)arg1; error = 0; first = 1; sbuf_new(&sb, tmpbuf, sizeof(tmpbuf), 0); for (i = 0; i < sizeof(da_zone_desc_table) / sizeof(da_zone_desc_table[0]); i++) { if (softc->zone_flags & da_zone_desc_table[i].value) { if (first == 0) sbuf_printf(&sb, ", "); else first = 0; sbuf_cat(&sb, da_zone_desc_table[i].desc); } } if (first == 1) sbuf_printf(&sb, "None"); sbuf_finish(&sb); error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); return (error); } static cam_status daregister(struct cam_periph *periph, void *arg) { struct da_softc *softc; struct ccb_pathinq cpi; struct ccb_getdev *cgd; char tmpstr[80]; caddr_t match; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { printf("daregister: no getdev CCB, can't register device\n"); return(CAM_REQ_CMP_ERR); } softc = (struct da_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT|M_ZERO); if (softc == NULL) { printf("daregister: Unable to probe new device. " "Unable to allocate softc\n"); return(CAM_REQ_CMP_ERR); } - if (cam_iosched_init(&softc->cam_iosched, periph, 0) != 0) { + if (cam_iosched_init(&softc->cam_iosched, periph) != 0) { printf("daregister: Unable to probe new device. " "Unable to allocate iosched memory\n"); free(softc, M_DEVBUF); return(CAM_REQ_CMP_ERR); } LIST_INIT(&softc->pending_ccbs); softc->state = DA_STATE_PROBE_WP; bioq_init(&softc->delete_run_queue); if (SID_IS_REMOVABLE(&cgd->inq_data)) softc->flags |= DA_FLAG_PACK_REMOVABLE; softc->unmap_max_ranges = UNMAP_MAX_RANGES; softc->unmap_max_lba = UNMAP_RANGE_MAX; softc->unmap_gran = 0; softc->unmap_gran_align = 0; softc->ws_max_blks = WS16_MAX_BLKS; softc->trim_max_ranges = ATA_TRIM_MAX_RANGES; softc->rotating = 1; periph->softc = softc; /* * See if this device has any quirks. */ match = cam_quirkmatch((caddr_t)&cgd->inq_data, (caddr_t)da_quirk_table, nitems(da_quirk_table), sizeof(*da_quirk_table), scsi_inquiry_match); if (match != NULL) softc->quirks = ((struct da_quirk_entry *)match)->quirks; else softc->quirks = DA_Q_NONE; /* Check if the SIM does not want 6 byte commands */ xpt_path_inq(&cpi, periph->path); if (cpi.ccb_h.status == CAM_REQ_CMP && (cpi.hba_misc & PIM_NO_6_BYTE)) softc->quirks |= DA_Q_NO_6_BYTE; if (SID_TYPE(&cgd->inq_data) == T_ZBC_HM) softc->zone_mode = DA_ZONE_HOST_MANAGED; else if (softc->quirks & DA_Q_SMR_DM) softc->zone_mode = DA_ZONE_DRIVE_MANAGED; else softc->zone_mode = DA_ZONE_NONE; if (softc->zone_mode != DA_ZONE_NONE) { if (scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) { if (scsi_vpd_supported_page(periph, SVPD_ZONED_BDC)) softc->zone_interface = DA_ZONE_IF_ATA_SAT; else softc->zone_interface = DA_ZONE_IF_ATA_PASS; } else softc->zone_interface = DA_ZONE_IF_SCSI; } TASK_INIT(&softc->sysctl_task, 0, dasysctlinit, periph); /* * Take an exclusive refcount on the periph while dastart is called * to finish the probe. The reference will be dropped in dadone at * the end of probe. * * XXX if cam_periph_hold returns an error, we don't hold a refcount. */ (void)da_periph_hold(periph, PRIBIO, DA_REF_PROBE_HOLD); /* * Schedule a periodic event to occasionally send an * ordered tag to a device. */ callout_init_mtx(&softc->sendordered_c, cam_periph_mtx(periph), 0); callout_reset(&softc->sendordered_c, (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL, dasendorderedtag, softc); cam_periph_unlock(periph); /* * RBC devices don't have to support READ(6), only READ(10). */ if (softc->quirks & DA_Q_NO_6_BYTE || SID_TYPE(&cgd->inq_data) == T_RBC) softc->minimum_cmd_size = 10; else softc->minimum_cmd_size = 6; /* * Load the user's default, if any. */ snprintf(tmpstr, sizeof(tmpstr), "kern.cam.da.%d.minimum_cmd_size", periph->unit_number); TUNABLE_INT_FETCH(tmpstr, &softc->minimum_cmd_size); /* * 6, 10, 12 and 16 are the currently permissible values. */ if (softc->minimum_cmd_size > 12) softc->minimum_cmd_size = 16; else if (softc->minimum_cmd_size > 10) softc->minimum_cmd_size = 12; else if (softc->minimum_cmd_size > 6) softc->minimum_cmd_size = 10; else softc->minimum_cmd_size = 6; /* Predict whether device may support READ CAPACITY(16). */ if (SID_ANSI_REV(&cgd->inq_data) >= SCSI_REV_SPC3 && (softc->quirks & DA_Q_NO_RC16) == 0) { softc->flags |= DA_FLAG_CAN_RC16; } /* * Register this media as a disk. */ softc->disk = disk_alloc(); softc->disk->d_devstat = devstat_new_entry(periph->periph_name, periph->unit_number, 0, DEVSTAT_BS_UNAVAILABLE, SID_TYPE(&cgd->inq_data) | XPORT_DEVSTAT_TYPE(cpi.transport), DEVSTAT_PRIORITY_DISK); softc->disk->d_open = daopen; softc->disk->d_close = daclose; softc->disk->d_strategy = dastrategy; softc->disk->d_dump = dadump; softc->disk->d_getattr = dagetattr; softc->disk->d_gone = dadiskgonecb; softc->disk->d_name = "da"; softc->disk->d_drv1 = periph; if (cpi.maxio == 0) softc->maxio = DFLTPHYS; /* traditional default */ else if (cpi.maxio > MAXPHYS) softc->maxio = MAXPHYS; /* for safety */ else softc->maxio = cpi.maxio; softc->disk->d_maxsize = softc->maxio; softc->disk->d_unit = periph->unit_number; softc->disk->d_flags = DISKFLAG_DIRECT_COMPLETION | DISKFLAG_CANZONE; if ((softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) softc->disk->d_flags |= DISKFLAG_CANFLUSHCACHE; if ((cpi.hba_misc & PIM_UNMAPPED) != 0) { softc->unmappedio = 1; softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO; } cam_strvis(softc->disk->d_descr, cgd->inq_data.vendor, sizeof(cgd->inq_data.vendor), sizeof(softc->disk->d_descr)); strlcat(softc->disk->d_descr, " ", sizeof(softc->disk->d_descr)); cam_strvis(&softc->disk->d_descr[strlen(softc->disk->d_descr)], cgd->inq_data.product, sizeof(cgd->inq_data.product), sizeof(softc->disk->d_descr) - strlen(softc->disk->d_descr)); softc->disk->d_hba_vendor = cpi.hba_vendor; softc->disk->d_hba_device = cpi.hba_device; softc->disk->d_hba_subvendor = cpi.hba_subvendor; softc->disk->d_hba_subdevice = cpi.hba_subdevice; /* * Acquire a reference to the periph before we register with GEOM. * We'll release this reference once GEOM calls us back (via * dadiskgonecb()) telling us that our provider has been freed. */ if (da_periph_acquire(periph, DA_REF_GEOM) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } disk_create(softc->disk, DISK_VERSION); cam_periph_lock(periph); /* * Add async callbacks for events of interest. * I don't bother checking if this fails as, * in most cases, the system will function just * fine without them and the only alternative * would be to not attach the device on failure. */ xpt_register_async(AC_SENT_BDR | AC_BUS_RESET | AC_LOST_DEVICE | AC_ADVINFO_CHANGED | AC_SCSI_AEN | AC_UNIT_ATTENTION | AC_INQ_CHANGED, daasync, periph, periph->path); /* * Emit an attribute changed notification just in case * physical path information arrived before our async * event handler was registered, but after anyone attaching * to our disk device polled it. */ disk_attr_changed(softc->disk, "GEOM::physpath", M_NOWAIT); /* * Schedule a periodic media polling events. */ callout_init_mtx(&softc->mediapoll_c, cam_periph_mtx(periph), 0); if ((softc->flags & DA_FLAG_PACK_REMOVABLE) && (cgd->inq_flags & SID_AEN) == 0 && da_poll_period != 0) callout_reset(&softc->mediapoll_c, da_poll_period * hz, damediapoll, periph); xpt_schedule(periph, CAM_PRIORITY_DEV); return(CAM_REQ_CMP); } static int da_zone_bio_to_scsi(int disk_zone_cmd) { switch (disk_zone_cmd) { case DISK_ZONE_OPEN: return ZBC_OUT_SA_OPEN; case DISK_ZONE_CLOSE: return ZBC_OUT_SA_CLOSE; case DISK_ZONE_FINISH: return ZBC_OUT_SA_FINISH; case DISK_ZONE_RWP: return ZBC_OUT_SA_RWP; } return -1; } static int da_zone_cmd(struct cam_periph *periph, union ccb *ccb, struct bio *bp, int *queue_ccb) { struct da_softc *softc; int error; error = 0; if (bp->bio_cmd != BIO_ZONE) { error = EINVAL; goto bailout; } softc = periph->softc; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: case DISK_ZONE_CLOSE: case DISK_ZONE_FINISH: case DISK_ZONE_RWP: { int zone_flags; int zone_sa; uint64_t lba; zone_sa = da_zone_bio_to_scsi(bp->bio_zone.zone_cmd); if (zone_sa == -1) { xpt_print(periph->path, "Cannot translate zone " "cmd %#x to SCSI\n", bp->bio_zone.zone_cmd); error = EINVAL; goto bailout; } zone_flags = 0; lba = bp->bio_zone.zone_params.rwp.id; if (bp->bio_zone.zone_params.rwp.flags & DISK_ZONE_RWP_FLAG_ALL) zone_flags |= ZBC_OUT_ALL; if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) { scsi_zbc_out(&ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*service_action*/ zone_sa, /*zone_id*/ lba, /*zone_flags*/ zone_flags, /*data_ptr*/ NULL, /*dxfer_len*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); } else { /* * Note that in this case, even though we can * technically use NCQ, we don't bother for several * reasons: * 1. It hasn't been tested on a SAT layer that * supports it. This is new as of SAT-4. * 2. Even when there is a SAT layer that supports * it, that SAT layer will also probably support * ZBC -> ZAC translation, since they are both * in the SAT-4 spec. * 3. Translation will likely be preferable to ATA * passthrough. LSI / Avago at least single * steps ATA passthrough commands in the HBA, * regardless of protocol, so unless that * changes, there is a performance penalty for * doing ATA passthrough no matter whether * you're using NCQ/FPDMA, DMA or PIO. * 4. It requires a 32-byte CDB, which at least at * this point in CAM requires a CDB pointer, which * would require us to allocate an additional bit * of storage separate from the CCB. */ error = scsi_ata_zac_mgmt_out(&ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*use_ncq*/ 0, /*zm_action*/ zone_sa, /*zone_id*/ lba, /*zone_flags*/ zone_flags, /*data_ptr*/ NULL, /*dxfer_len*/ 0, /*cdb_storage*/ NULL, /*cdb_storage_len*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (error != 0) { error = EINVAL; xpt_print(periph->path, "scsi_ata_zac_mgmt_out() returned an " "error!"); goto bailout; } } *queue_ccb = 1; break; } case DISK_ZONE_REPORT_ZONES: { uint8_t *rz_ptr; uint32_t num_entries, alloc_size; struct disk_zone_report *rep; rep = &bp->bio_zone.zone_params.report; num_entries = rep->entries_allocated; if (num_entries == 0) { xpt_print(periph->path, "No entries allocated for " "Report Zones request\n"); error = EINVAL; goto bailout; } alloc_size = sizeof(struct scsi_report_zones_hdr) + (sizeof(struct scsi_report_zones_desc) * num_entries); alloc_size = min(alloc_size, softc->disk->d_maxsize); rz_ptr = malloc(alloc_size, M_SCSIDA, M_NOWAIT | M_ZERO); if (rz_ptr == NULL) { xpt_print(periph->path, "Unable to allocate memory " "for Report Zones request\n"); error = ENOMEM; goto bailout; } if (softc->zone_interface != DA_ZONE_IF_ATA_PASS) { scsi_zbc_in(&ccb->csio, /*retries*/ da_retry_count, /*cbcfnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*service_action*/ ZBC_IN_SA_REPORT_ZONES, /*zone_start_lba*/ rep->starting_id, /*zone_options*/ rep->rep_options, /*data_ptr*/ rz_ptr, /*dxfer_len*/ alloc_size, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); } else { /* * Note that in this case, even though we can * technically use NCQ, we don't bother for several * reasons: * 1. It hasn't been tested on a SAT layer that * supports it. This is new as of SAT-4. * 2. Even when there is a SAT layer that supports * it, that SAT layer will also probably support * ZBC -> ZAC translation, since they are both * in the SAT-4 spec. * 3. Translation will likely be preferable to ATA * passthrough. LSI / Avago at least single * steps ATA passthrough commands in the HBA, * regardless of protocol, so unless that * changes, there is a performance penalty for * doing ATA passthrough no matter whether * you're using NCQ/FPDMA, DMA or PIO. * 4. It requires a 32-byte CDB, which at least at * this point in CAM requires a CDB pointer, which * would require us to allocate an additional bit * of storage separate from the CCB. */ error = scsi_ata_zac_mgmt_in(&ccb->csio, /*retries*/ da_retry_count, /*cbcfnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*use_ncq*/ 0, /*zm_action*/ ATA_ZM_REPORT_ZONES, /*zone_id*/ rep->starting_id, /*zone_flags*/ rep->rep_options, /*data_ptr*/ rz_ptr, /*dxfer_len*/ alloc_size, /*cdb_storage*/ NULL, /*cdb_storage_len*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (error != 0) { error = EINVAL; xpt_print(periph->path, "scsi_ata_zac_mgmt_in() returned an " "error!"); goto bailout; } } /* * For BIO_ZONE, this isn't normally needed. However, it * is used by devstat_end_transaction_bio() to determine * how much data was transferred. */ /* * XXX KDM we have a problem. But I'm not sure how to fix * it. devstat uses bio_bcount - bio_resid to calculate * the amount of data transferred. The GEOM disk code * uses bio_length - bio_resid to calculate the amount of * data in bio_completed. We have different structure * sizes above and below the ada(4) driver. So, if we * use the sizes above, the amount transferred won't be * quite accurate for devstat. If we use different sizes * for bio_bcount and bio_length (above and below * respectively), then the residual needs to match one or * the other. Everything is calculated after the bio * leaves the driver, so changing the values around isn't * really an option. For now, just set the count to the * passed in length. This means that the calculations * above (e.g. bio_completed) will be correct, but the * amount of data reported to devstat will be slightly * under or overstated. */ bp->bio_bcount = bp->bio_length; *queue_ccb = 1; break; } case DISK_ZONE_GET_PARAMS: { struct disk_zone_disk_params *params; params = &bp->bio_zone.zone_params.disk_params; bzero(params, sizeof(*params)); switch (softc->zone_mode) { case DA_ZONE_DRIVE_MANAGED: params->zone_mode = DISK_ZONE_MODE_DRIVE_MANAGED; break; case DA_ZONE_HOST_AWARE: params->zone_mode = DISK_ZONE_MODE_HOST_AWARE; break; case DA_ZONE_HOST_MANAGED: params->zone_mode = DISK_ZONE_MODE_HOST_MANAGED; break; default: case DA_ZONE_NONE: params->zone_mode = DISK_ZONE_MODE_NONE; break; } if (softc->zone_flags & DA_ZONE_FLAG_URSWRZ) params->flags |= DISK_ZONE_DISK_URSWRZ; if (softc->zone_flags & DA_ZONE_FLAG_OPT_SEQ_SET) { params->optimal_seq_zones = softc->optimal_seq_zones; params->flags |= DISK_ZONE_OPT_SEQ_SET; } if (softc->zone_flags & DA_ZONE_FLAG_OPT_NONSEQ_SET) { params->optimal_nonseq_zones = softc->optimal_nonseq_zones; params->flags |= DISK_ZONE_OPT_NONSEQ_SET; } if (softc->zone_flags & DA_ZONE_FLAG_MAX_SEQ_SET) { params->max_seq_zones = softc->max_seq_zones; params->flags |= DISK_ZONE_MAX_SEQ_SET; } if (softc->zone_flags & DA_ZONE_FLAG_RZ_SUP) params->flags |= DISK_ZONE_RZ_SUP; if (softc->zone_flags & DA_ZONE_FLAG_OPEN_SUP) params->flags |= DISK_ZONE_OPEN_SUP; if (softc->zone_flags & DA_ZONE_FLAG_CLOSE_SUP) params->flags |= DISK_ZONE_CLOSE_SUP; if (softc->zone_flags & DA_ZONE_FLAG_FINISH_SUP) params->flags |= DISK_ZONE_FINISH_SUP; if (softc->zone_flags & DA_ZONE_FLAG_RWP_SUP) params->flags |= DISK_ZONE_RWP_SUP; break; } default: break; } bailout: return (error); } static void dastart(struct cam_periph *periph, union ccb *start_ccb) { struct da_softc *softc; softc = (struct da_softc *)periph->softc; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dastart\n")); skipstate: switch (softc->state) { case DA_STATE_NORMAL: { struct bio *bp; uint8_t tag_code; more: bp = cam_iosched_next_bio(softc->cam_iosched); if (bp == NULL) { if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) { cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR); scsi_test_unit_ready(&start_ccb->csio, /*retries*/ da_retry_count, dadone, MSG_SIMPLE_Q_TAG, SSD_FULL_SIZE, da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_TUR; xpt_action(start_ccb); } else xpt_release_ccb(start_ccb); break; } if (bp->bio_cmd == BIO_DELETE) { if (softc->delete_func != NULL) { softc->delete_func(periph, start_ccb, bp); goto out; } else { /* Not sure this is possible, but failsafe by lying and saying "sure, done." */ biofinish(bp, NULL, 0); goto more; } } if (cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR)) { cam_iosched_clr_work_flags(softc->cam_iosched, DA_WORK_TUR); da_periph_release_locked(periph, DA_REF_TUR); } if ((bp->bio_flags & BIO_ORDERED) != 0 || (softc->flags & DA_FLAG_NEED_OTAG) != 0) { softc->flags &= ~DA_FLAG_NEED_OTAG; softc->flags |= DA_FLAG_WAS_OTAG; tag_code = MSG_ORDERED_Q_TAG; } else { tag_code = MSG_SIMPLE_Q_TAG; } switch (bp->bio_cmd) { case BIO_WRITE: case BIO_READ: { void *data_ptr; int rw_op; biotrack(bp, __func__); if (bp->bio_cmd == BIO_WRITE) { softc->flags |= DA_FLAG_DIRTY; rw_op = SCSI_RW_WRITE; } else { rw_op = SCSI_RW_READ; } data_ptr = bp->bio_data; if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) { rw_op |= SCSI_RW_BIO; data_ptr = bp; } scsi_read_write(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/tag_code, rw_op, /*byte2*/0, softc->minimum_cmd_size, /*lba*/bp->bio_pblkno, /*block_count*/bp->bio_bcount / softc->params.secsize, data_ptr, /*dxfer_len*/ bp->bio_bcount, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) start_ccb->csio.bio = bp; #endif break; } case BIO_FLUSH: /* * If we don't support sync cache, or the disk * isn't dirty, FLUSH is a no-op. Use the * allocated * CCB for the next bio if one is * available. */ if ((softc->quirks & DA_Q_NO_SYNC_CACHE) != 0 || (softc->flags & DA_FLAG_DIRTY) == 0) { biodone(bp); goto skipstate; } /* * BIO_FLUSH doesn't currently communicate * range data, so we synchronize the cache * over the whole disk. We also force * ordered tag semantics the flush applies * to all previously queued I/O. */ scsi_synchronize_cache(&start_ccb->csio, /*retries*/1, /*cbfcnp*/dadone, MSG_ORDERED_Q_TAG, /*begin_lba*/0, /*lb_count*/0, SSD_FULL_SIZE, da_default_timeout*1000); /* * Clear the dirty flag before sending the command. * Either this sync cache will be successful, or it * will fail after a retry. If it fails, it is * unlikely to be successful if retried later, so * we'll save ourselves time by just marking the * device clean. */ softc->flags &= ~DA_FLAG_DIRTY; break; case BIO_ZONE: { int error, queue_ccb; queue_ccb = 0; error = da_zone_cmd(periph, start_ccb, bp,&queue_ccb); if ((error != 0) || (queue_ccb == 0)) { biofinish(bp, NULL, error); xpt_release_ccb(start_ccb); return; } break; } } start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO; start_ccb->ccb_h.flags |= CAM_UNLOCKED; start_ccb->ccb_h.softtimeout = sbttotv(da_default_softtimeout); out: LIST_INSERT_HEAD(&softc->pending_ccbs, &start_ccb->ccb_h, periph_links.le); /* We expect a unit attention from this device */ if ((softc->flags & DA_FLAG_RETRY_UA) != 0) { start_ccb->ccb_h.ccb_state |= DA_CCB_RETRY_UA; softc->flags &= ~DA_FLAG_RETRY_UA; } start_ccb->ccb_h.ccb_bp = bp; softc->refcount++; cam_periph_unlock(periph); xpt_action(start_ccb); cam_periph_lock(periph); softc->refcount--; /* May have more work to do, so ensure we stay scheduled */ daschedule(periph); break; } case DA_STATE_PROBE_WP: { void *mode_buf; int mode_buf_len; mode_buf_len = 192; mode_buf = malloc(mode_buf_len, M_SCSIDA, M_NOWAIT); if (mode_buf == NULL) { xpt_print(periph->path, "Unable to send mode sense - " "malloc failure\n"); softc->state = DA_STATE_PROBE_RC; goto skipstate; } scsi_mode_sense_len(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*dbd*/ FALSE, /*pc*/ SMS_PAGE_CTRL_CURRENT, /*page*/ SMS_ALL_PAGES_PAGE, /*param_buf*/ mode_buf, /*param_len*/ mode_buf_len, /*minimum_cmd_size*/ softc->minimum_cmd_size, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_WP; xpt_action(start_ccb); break; } case DA_STATE_PROBE_RC: { struct scsi_read_capacity_data *rcap; rcap = (struct scsi_read_capacity_data *) malloc(sizeof(*rcap), M_SCSIDA, M_NOWAIT|M_ZERO); if (rcap == NULL) { printf("dastart: Couldn't malloc read_capacity data\n"); /* da_free_periph??? */ break; } scsi_read_capacity(&start_ccb->csio, /*retries*/da_retry_count, dadone, MSG_SIMPLE_Q_TAG, rcap, SSD_FULL_SIZE, /*timeout*/5000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC; xpt_action(start_ccb); break; } case DA_STATE_PROBE_RC16: { struct scsi_read_capacity_data_long *rcaplong; rcaplong = (struct scsi_read_capacity_data_long *) malloc(sizeof(*rcaplong), M_SCSIDA, M_NOWAIT|M_ZERO); if (rcaplong == NULL) { printf("dastart: Couldn't malloc read_capacity data\n"); /* da_free_periph??? */ break; } scsi_read_capacity_16(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*lba*/ 0, /*reladr*/ 0, /*pmi*/ 0, /*rcap_buf*/ (uint8_t *)rcaplong, /*rcap_buf_len*/ sizeof(*rcaplong), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_RC16; xpt_action(start_ccb); break; } case DA_STATE_PROBE_LBP: { struct scsi_vpd_logical_block_prov *lbp; if (!scsi_vpd_supported_page(periph, SVPD_LBP)) { /* * If we get here we don't support any SBC-3 delete * methods with UNMAP as the Logical Block Provisioning * VPD page support is required for devices which * support it according to T10/1799-D Revision 31 * however older revisions of the spec don't mandate * this so we currently don't remove these methods * from the available set. */ softc->state = DA_STATE_PROBE_BLK_LIMITS; goto skipstate; } lbp = (struct scsi_vpd_logical_block_prov *) malloc(sizeof(*lbp), M_SCSIDA, M_NOWAIT|M_ZERO); if (lbp == NULL) { printf("dastart: Couldn't malloc lbp data\n"); /* da_free_periph??? */ break; } scsi_inquiry(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*inq_buf*/(u_int8_t *)lbp, /*inq_len*/sizeof(*lbp), /*evpd*/TRUE, /*page_code*/SVPD_LBP, /*sense_len*/SSD_MIN_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_LBP; xpt_action(start_ccb); break; } case DA_STATE_PROBE_BLK_LIMITS: { struct scsi_vpd_block_limits *block_limits; if (!scsi_vpd_supported_page(periph, SVPD_BLOCK_LIMITS)) { /* Not supported skip to next probe */ softc->state = DA_STATE_PROBE_BDC; goto skipstate; } block_limits = (struct scsi_vpd_block_limits *) malloc(sizeof(*block_limits), M_SCSIDA, M_NOWAIT|M_ZERO); if (block_limits == NULL) { printf("dastart: Couldn't malloc block_limits data\n"); /* da_free_periph??? */ break; } scsi_inquiry(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*inq_buf*/(u_int8_t *)block_limits, /*inq_len*/sizeof(*block_limits), /*evpd*/TRUE, /*page_code*/SVPD_BLOCK_LIMITS, /*sense_len*/SSD_MIN_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BLK_LIMITS; xpt_action(start_ccb); break; } case DA_STATE_PROBE_BDC: { struct scsi_vpd_block_characteristics *bdc; if (!scsi_vpd_supported_page(periph, SVPD_BDC)) { softc->state = DA_STATE_PROBE_ATA; goto skipstate; } bdc = (struct scsi_vpd_block_characteristics *) malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO); if (bdc == NULL) { printf("dastart: Couldn't malloc bdc data\n"); /* da_free_periph??? */ break; } scsi_inquiry(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*inq_buf*/(u_int8_t *)bdc, /*inq_len*/sizeof(*bdc), /*evpd*/TRUE, /*page_code*/SVPD_BDC, /*sense_len*/SSD_MIN_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_BDC; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA: { struct ata_params *ata_params; if (!scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) { if ((softc->zone_mode == DA_ZONE_HOST_AWARE) || (softc->zone_mode == DA_ZONE_HOST_MANAGED)) { /* * Note that if the ATA VPD page isn't * supported, we aren't talking to an ATA * device anyway. Support for that VPD * page is mandatory for SCSI to ATA (SAT) * translation layers. */ softc->state = DA_STATE_PROBE_ZONE; goto skipstate; } daprobedone(periph, start_ccb); break; } ata_params = (struct ata_params*) malloc(sizeof(*ata_params), M_SCSIDA,M_NOWAIT|M_ZERO); if (ata_params == NULL) { xpt_print(periph->path, "Couldn't malloc ata_params " "data\n"); /* da_free_periph??? */ break; } scsi_ata_identify(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*data_ptr*/(u_int8_t *)ata_params, /*dxfer_len*/sizeof(*ata_params), /*sense_len*/SSD_FULL_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA_LOGDIR: { struct ata_gp_log_dir *log_dir; int retval; retval = 0; if ((softc->flags & DA_FLAG_CAN_ATA_LOG) == 0) { /* * If we don't have log support, not much point in * trying to probe zone support. */ daprobedone(periph, start_ccb); break; } /* * If we have an ATA device (the SCSI ATA Information VPD * page should be present and the ATA identify should have * succeeded) and it supports logs, ask for the log directory. */ log_dir = malloc(sizeof(*log_dir), M_SCSIDA, M_NOWAIT|M_ZERO); if (log_dir == NULL) { xpt_print(periph->path, "Couldn't malloc log_dir " "data\n"); daprobedone(periph, start_ccb); break; } retval = scsi_ata_read_log(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*log_address*/ ATA_LOG_DIRECTORY, /*page_number*/ 0, /*block_count*/ 1, /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ? AP_PROTO_DMA : AP_PROTO_PIO_IN, /*data_ptr*/ (uint8_t *)log_dir, /*dxfer_len*/ sizeof(*log_dir), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (retval != 0) { xpt_print(periph->path, "scsi_ata_read_log() failed!"); free(log_dir, M_SCSIDA); daprobedone(periph, start_ccb); break; } start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_LOGDIR; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA_IDDIR: { struct ata_identify_log_pages *id_dir; int retval; retval = 0; /* * Check here to see whether the Identify Device log is * supported in the directory of logs. If so, continue * with requesting the log of identify device pages. */ if ((softc->flags & DA_FLAG_CAN_ATA_IDLOG) == 0) { daprobedone(periph, start_ccb); break; } id_dir = malloc(sizeof(*id_dir), M_SCSIDA, M_NOWAIT | M_ZERO); if (id_dir == NULL) { xpt_print(periph->path, "Couldn't malloc id_dir " "data\n"); daprobedone(periph, start_ccb); break; } retval = scsi_ata_read_log(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_PAGE_LIST, /*block_count*/ 1, /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ? AP_PROTO_DMA : AP_PROTO_PIO_IN, /*data_ptr*/ (uint8_t *)id_dir, /*dxfer_len*/ sizeof(*id_dir), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (retval != 0) { xpt_print(periph->path, "scsi_ata_read_log() failed!"); free(id_dir, M_SCSIDA); daprobedone(periph, start_ccb); break; } start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_IDDIR; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA_SUP: { struct ata_identify_log_sup_cap *sup_cap; int retval; retval = 0; /* * Check here to see whether the Supported Capabilities log * is in the list of Identify Device logs. */ if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP) == 0) { daprobedone(periph, start_ccb); break; } sup_cap = malloc(sizeof(*sup_cap), M_SCSIDA, M_NOWAIT|M_ZERO); if (sup_cap == NULL) { xpt_print(periph->path, "Couldn't malloc sup_cap " "data\n"); daprobedone(periph, start_ccb); break; } retval = scsi_ata_read_log(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_SUP_CAP, /*block_count*/ 1, /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ? AP_PROTO_DMA : AP_PROTO_PIO_IN, /*data_ptr*/ (uint8_t *)sup_cap, /*dxfer_len*/ sizeof(*sup_cap), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (retval != 0) { xpt_print(periph->path, "scsi_ata_read_log() failed!"); free(sup_cap, M_SCSIDA); daprobedone(periph, start_ccb); break; } start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_SUP; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ATA_ZONE: { struct ata_zoned_info_log *ata_zone; int retval; retval = 0; /* * Check here to see whether the zoned device information * page is supported. If so, continue on to request it. * If not, skip to DA_STATE_PROBE_LOG or done. */ if ((softc->flags & DA_FLAG_CAN_ATA_ZONE) == 0) { daprobedone(periph, start_ccb); break; } ata_zone = malloc(sizeof(*ata_zone), M_SCSIDA, M_NOWAIT|M_ZERO); if (ata_zone == NULL) { xpt_print(periph->path, "Couldn't malloc ata_zone " "data\n"); daprobedone(periph, start_ccb); break; } retval = scsi_ata_read_log(&start_ccb->csio, /*retries*/ da_retry_count, /*cbfcnp*/ dadone, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*log_address*/ ATA_IDENTIFY_DATA_LOG, /*page_number*/ ATA_IDL_ZDI, /*block_count*/ 1, /*protocol*/ softc->flags & DA_FLAG_CAN_ATA_DMA ? AP_PROTO_DMA : AP_PROTO_PIO_IN, /*data_ptr*/ (uint8_t *)ata_zone, /*dxfer_len*/ sizeof(*ata_zone), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ da_default_timeout * 1000); if (retval != 0) { xpt_print(periph->path, "scsi_ata_read_log() failed!"); free(ata_zone, M_SCSIDA); daprobedone(periph, start_ccb); break; } start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ATA_ZONE; xpt_action(start_ccb); break; } case DA_STATE_PROBE_ZONE: { struct scsi_vpd_zoned_bdc *bdc; /* * Note that this page will be supported for SCSI protocol * devices that support ZBC (SMR devices), as well as ATA * protocol devices that are behind a SAT (SCSI to ATA * Translation) layer that supports converting ZBC commands * to their ZAC equivalents. */ if (!scsi_vpd_supported_page(periph, SVPD_ZONED_BDC)) { daprobedone(periph, start_ccb); break; } bdc = (struct scsi_vpd_zoned_bdc *) malloc(sizeof(*bdc), M_SCSIDA, M_NOWAIT|M_ZERO); if (bdc == NULL) { xpt_release_ccb(start_ccb); xpt_print(periph->path, "Couldn't malloc zone VPD " "data\n"); break; } scsi_inquiry(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*inq_buf*/(u_int8_t *)bdc, /*inq_len*/sizeof(*bdc), /*evpd*/TRUE, /*page_code*/SVPD_ZONED_BDC, /*sense_len*/SSD_FULL_SIZE, /*timeout*/da_default_timeout * 1000); start_ccb->ccb_h.ccb_bp = NULL; start_ccb->ccb_h.ccb_state = DA_CCB_PROBE_ZONE; xpt_action(start_ccb); break; } } } /* * In each of the methods below, while its the caller's * responsibility to ensure the request will fit into a * single device request, we might have changed the delete * method due to the device incorrectly advertising either * its supported methods or limits. * * To prevent this causing further issues we validate the * against the methods limits, and warn which would * otherwise be unnecessary. */ static void da_delete_unmap(struct cam_periph *periph, union ccb *ccb, struct bio *bp) { struct da_softc *softc = (struct da_softc *)periph->softc;; struct bio *bp1; uint8_t *buf = softc->unmap_buf; struct scsi_unmap_desc *d = (void *)&buf[UNMAP_HEAD_SIZE]; uint64_t lba, lastlba = (uint64_t)-1; uint64_t totalcount = 0; uint64_t count; uint32_t c, lastcount = 0, ranges = 0; /* * Currently this doesn't take the UNMAP * Granularity and Granularity Alignment * fields into account. * * This could result in both unoptimal unmap * requests as as well as UNMAP calls unmapping * fewer LBA's than requested. */ bzero(softc->unmap_buf, sizeof(softc->unmap_buf)); bp1 = bp; do { /* * Note: ada and da are different in how they store the * pending bp's in a trim. ada stores all of them in the * trim_req.bps. da stores all but the first one in the * delete_run_queue. ada then completes all the bps in * its adadone() loop. da completes all the bps in the * delete_run_queue in dadone, and relies on the biodone * after to complete. This should be reconciled since there's * no real reason to do it differently. XXX */ if (bp1 != bp) bioq_insert_tail(&softc->delete_run_queue, bp1); lba = bp1->bio_pblkno; count = bp1->bio_bcount / softc->params.secsize; /* Try to extend the previous range. */ if (lba == lastlba) { c = omin(count, UNMAP_RANGE_MAX - lastcount); lastlba += c; lastcount += c; scsi_ulto4b(lastcount, d[ranges - 1].length); count -= c; lba += c; totalcount += c; } else if ((softc->quirks & DA_Q_STRICT_UNMAP) && softc->unmap_gran != 0) { /* Align length of the previous range. */ if ((c = lastcount % softc->unmap_gran) != 0) { if (lastcount <= c) { totalcount -= lastcount; lastlba = (uint64_t)-1; lastcount = 0; ranges--; } else { totalcount -= c; lastlba -= c; lastcount -= c; scsi_ulto4b(lastcount, d[ranges - 1].length); } } /* Align beginning of the new range. */ c = (lba - softc->unmap_gran_align) % softc->unmap_gran; if (c != 0) { c = softc->unmap_gran - c; if (count <= c) { count = 0; } else { lba += c; count -= c; } } } while (count > 0) { c = omin(count, UNMAP_RANGE_MAX); if (totalcount + c > softc->unmap_max_lba || ranges >= softc->unmap_max_ranges) { xpt_print(periph->path, "%s issuing short delete %ld > %ld" "|| %d >= %d", da_delete_method_desc[softc->delete_method], totalcount + c, softc->unmap_max_lba, ranges, softc->unmap_max_ranges); break; } scsi_u64to8b(lba, d[ranges].lba); scsi_ulto4b(c, d[ranges].length); lba += c; totalcount += c; ranges++; count -= c; lastlba = lba; lastcount = c; } bp1 = cam_iosched_next_trim(softc->cam_iosched); if (bp1 == NULL) break; if (ranges >= softc->unmap_max_ranges || totalcount + bp1->bio_bcount / softc->params.secsize > softc->unmap_max_lba) { cam_iosched_put_back_trim(softc->cam_iosched, bp1); break; } } while (1); /* Align length of the last range. */ if ((softc->quirks & DA_Q_STRICT_UNMAP) && softc->unmap_gran != 0 && (c = lastcount % softc->unmap_gran) != 0) { if (lastcount <= c) ranges--; else scsi_ulto4b(lastcount - c, d[ranges - 1].length); } scsi_ulto2b(ranges * 16 + 6, &buf[0]); scsi_ulto2b(ranges * 16, &buf[2]); scsi_unmap(&ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*byte2*/0, /*data_ptr*/ buf, /*dxfer_len*/ ranges * 16 + 8, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); ccb->ccb_h.ccb_state = DA_CCB_DELETE; ccb->ccb_h.flags |= CAM_UNLOCKED; cam_iosched_submit_trim(softc->cam_iosched); } static void da_delete_trim(struct cam_periph *periph, union ccb *ccb, struct bio *bp) { struct da_softc *softc = (struct da_softc *)periph->softc; struct bio *bp1; uint8_t *buf = softc->unmap_buf; uint64_t lastlba = (uint64_t)-1; uint64_t count; uint64_t lba; uint32_t lastcount = 0, c, requestcount; int ranges = 0, off, block_count; bzero(softc->unmap_buf, sizeof(softc->unmap_buf)); bp1 = bp; do { if (bp1 != bp)//XXX imp XXX bioq_insert_tail(&softc->delete_run_queue, bp1); lba = bp1->bio_pblkno; count = bp1->bio_bcount / softc->params.secsize; requestcount = count; /* Try to extend the previous range. */ if (lba == lastlba) { c = omin(count, ATA_DSM_RANGE_MAX - lastcount); lastcount += c; off = (ranges - 1) * 8; buf[off + 6] = lastcount & 0xff; buf[off + 7] = (lastcount >> 8) & 0xff; count -= c; lba += c; } while (count > 0) { c = omin(count, ATA_DSM_RANGE_MAX); off = ranges * 8; buf[off + 0] = lba & 0xff; buf[off + 1] = (lba >> 8) & 0xff; buf[off + 2] = (lba >> 16) & 0xff; buf[off + 3] = (lba >> 24) & 0xff; buf[off + 4] = (lba >> 32) & 0xff; buf[off + 5] = (lba >> 40) & 0xff; buf[off + 6] = c & 0xff; buf[off + 7] = (c >> 8) & 0xff; lba += c; ranges++; count -= c; lastcount = c; if (count != 0 && ranges == softc->trim_max_ranges) { xpt_print(periph->path, "%s issuing short delete %ld > %ld\n", da_delete_method_desc[softc->delete_method], requestcount, (softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX); break; } } lastlba = lba; bp1 = cam_iosched_next_trim(softc->cam_iosched); if (bp1 == NULL) break; if (bp1->bio_bcount / softc->params.secsize > (softc->trim_max_ranges - ranges) * ATA_DSM_RANGE_MAX) { cam_iosched_put_back_trim(softc->cam_iosched, bp1); break; } } while (1); block_count = howmany(ranges, ATA_DSM_BLK_RANGES); scsi_ata_trim(&ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, block_count, /*data_ptr*/buf, /*dxfer_len*/block_count * ATA_DSM_BLK_SIZE, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); ccb->ccb_h.ccb_state = DA_CCB_DELETE; ccb->ccb_h.flags |= CAM_UNLOCKED; cam_iosched_submit_trim(softc->cam_iosched); } /* * We calculate ws_max_blks here based off d_delmaxsize instead * of using softc->ws_max_blks as it is absolute max for the * device not the protocol max which may well be lower. */ static void da_delete_ws(struct cam_periph *periph, union ccb *ccb, struct bio *bp) { struct da_softc *softc; struct bio *bp1; uint64_t ws_max_blks; uint64_t lba; uint64_t count; /* forward compat with WS32 */ softc = (struct da_softc *)periph->softc; ws_max_blks = softc->disk->d_delmaxsize / softc->params.secsize; lba = bp->bio_pblkno; count = 0; bp1 = bp; do { if (bp1 != bp)//XXX imp XXX bioq_insert_tail(&softc->delete_run_queue, bp1); count += bp1->bio_bcount / softc->params.secsize; if (count > ws_max_blks) { xpt_print(periph->path, "%s issuing short delete %ld > %ld\n", da_delete_method_desc[softc->delete_method], count, ws_max_blks); count = omin(count, ws_max_blks); break; } bp1 = cam_iosched_next_trim(softc->cam_iosched); if (bp1 == NULL) break; if (lba + count != bp1->bio_pblkno || count + bp1->bio_bcount / softc->params.secsize > ws_max_blks) { cam_iosched_put_back_trim(softc->cam_iosched, bp1); break; } } while (1); scsi_write_same(&ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/MSG_SIMPLE_Q_TAG, /*byte2*/softc->delete_method == DA_DELETE_ZERO ? 0 : SWS_UNMAP, softc->delete_method == DA_DELETE_WS16 ? 16 : 10, /*lba*/lba, /*block_count*/count, /*data_ptr*/ __DECONST(void *, zero_region), /*dxfer_len*/ softc->params.secsize, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); ccb->ccb_h.ccb_state = DA_CCB_DELETE; ccb->ccb_h.flags |= CAM_UNLOCKED; cam_iosched_submit_trim(softc->cam_iosched); } static int cmd6workaround(union ccb *ccb) { struct scsi_rw_6 cmd6; struct scsi_rw_10 *cmd10; struct da_softc *softc; u_int8_t *cdb; struct bio *bp; int frozen; cdb = ccb->csio.cdb_io.cdb_bytes; softc = (struct da_softc *)xpt_path_periph(ccb->ccb_h.path)->softc; if (ccb->ccb_h.ccb_state == DA_CCB_DELETE) { da_delete_methods old_method = softc->delete_method; /* * Typically there are two reasons for failure here * 1. Delete method was detected as supported but isn't * 2. Delete failed due to invalid params e.g. too big * * While we will attempt to choose an alternative delete method * this may result in short deletes if the existing delete * requests from geom are big for the new method chosen. * * This method assumes that the error which triggered this * will not retry the io otherwise a panic will occur */ dadeleteflag(softc, old_method, 0); dadeletemethodchoose(softc, DA_DELETE_DISABLE); if (softc->delete_method == DA_DELETE_DISABLE) xpt_print(ccb->ccb_h.path, "%s failed, disabling BIO_DELETE\n", da_delete_method_desc[old_method]); else xpt_print(ccb->ccb_h.path, "%s failed, switching to %s BIO_DELETE\n", da_delete_method_desc[old_method], da_delete_method_desc[softc->delete_method]); while ((bp = bioq_takefirst(&softc->delete_run_queue)) != NULL) cam_iosched_queue_work(softc->cam_iosched, bp); cam_iosched_queue_work(softc->cam_iosched, (struct bio *)ccb->ccb_h.ccb_bp); ccb->ccb_h.ccb_bp = NULL; return (0); } /* Detect unsupported PREVENT ALLOW MEDIUM REMOVAL. */ if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 && (*cdb == PREVENT_ALLOW) && (softc->quirks & DA_Q_NO_PREVENT) == 0) { if (bootverbose) xpt_print(ccb->ccb_h.path, "PREVENT ALLOW MEDIUM REMOVAL not supported.\n"); softc->quirks |= DA_Q_NO_PREVENT; return (0); } /* Detect unsupported SYNCHRONIZE CACHE(10). */ if ((ccb->ccb_h.flags & CAM_CDB_POINTER) == 0 && (*cdb == SYNCHRONIZE_CACHE) && (softc->quirks & DA_Q_NO_SYNC_CACHE) == 0) { if (bootverbose) xpt_print(ccb->ccb_h.path, "SYNCHRONIZE CACHE(10) not supported.\n"); softc->quirks |= DA_Q_NO_SYNC_CACHE; softc->disk->d_flags &= ~DISKFLAG_CANFLUSHCACHE; return (0); } /* Translation only possible if CDB is an array and cmd is R/W6 */ if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0 || (*cdb != READ_6 && *cdb != WRITE_6)) return 0; xpt_print(ccb->ccb_h.path, "READ(6)/WRITE(6) not supported, " "increasing minimum_cmd_size to 10.\n"); softc->minimum_cmd_size = 10; bcopy(cdb, &cmd6, sizeof(struct scsi_rw_6)); cmd10 = (struct scsi_rw_10 *)cdb; cmd10->opcode = (cmd6.opcode == READ_6) ? READ_10 : WRITE_10; cmd10->byte2 = 0; scsi_ulto4b(scsi_3btoul(cmd6.addr), cmd10->addr); cmd10->reserved = 0; scsi_ulto2b(cmd6.length, cmd10->length); cmd10->control = cmd6.control; ccb->csio.cdb_len = sizeof(*cmd10); /* Requeue request, unfreezing queue if necessary */ frozen = (ccb->ccb_h.status & CAM_DEV_QFRZN) != 0; ccb->ccb_h.status = CAM_REQUEUE_REQ; xpt_action(ccb); if (frozen) { cam_release_devq(ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } return (ERESTART); } static void dazonedone(struct cam_periph *periph, union ccb *ccb) { struct da_softc *softc; struct bio *bp; softc = periph->softc; bp = (struct bio *)ccb->ccb_h.ccb_bp; switch (bp->bio_zone.zone_cmd) { case DISK_ZONE_OPEN: case DISK_ZONE_CLOSE: case DISK_ZONE_FINISH: case DISK_ZONE_RWP: break; case DISK_ZONE_REPORT_ZONES: { uint32_t avail_len; struct disk_zone_report *rep; struct scsi_report_zones_hdr *hdr; struct scsi_report_zones_desc *desc; struct disk_zone_rep_entry *entry; uint32_t hdr_len, num_avail; uint32_t num_to_fill, i; int ata; rep = &bp->bio_zone.zone_params.report; avail_len = ccb->csio.dxfer_len - ccb->csio.resid; /* * Note that bio_resid isn't normally used for zone * commands, but it is used by devstat_end_transaction_bio() * to determine how much data was transferred. Because * the size of the SCSI/ATA data structures is different * than the size of the BIO interface structures, the * amount of data actually transferred from the drive will * be different than the amount of data transferred to * the user. */ bp->bio_resid = ccb->csio.resid; hdr = (struct scsi_report_zones_hdr *)ccb->csio.data_ptr; if (avail_len < sizeof(*hdr)) { /* * Is there a better error than EIO here? We asked * for at least the header, and we got less than * that. */ bp->bio_error = EIO; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; break; } if (softc->zone_interface == DA_ZONE_IF_ATA_PASS) ata = 1; else ata = 0; hdr_len = ata ? le32dec(hdr->length) : scsi_4btoul(hdr->length); if (hdr_len > 0) rep->entries_available = hdr_len / sizeof(*desc); else rep->entries_available = 0; /* * NOTE: using the same values for the BIO version of the * same field as the SCSI/ATA values. This means we could * get some additional values that aren't defined in bio.h * if more values of the same field are defined later. */ rep->header.same = hdr->byte4 & SRZ_SAME_MASK; rep->header.maximum_lba = ata ? le64dec(hdr->maximum_lba) : scsi_8btou64(hdr->maximum_lba); /* * If the drive reports no entries that match the query, * we're done. */ if (hdr_len == 0) { rep->entries_filled = 0; break; } num_avail = min((avail_len - sizeof(*hdr)) / sizeof(*desc), hdr_len / sizeof(*desc)); /* * If the drive didn't return any data, then we're done. */ if (num_avail == 0) { rep->entries_filled = 0; break; } num_to_fill = min(num_avail, rep->entries_allocated); /* * If the user didn't allocate any entries for us to fill, * we're done. */ if (num_to_fill == 0) { rep->entries_filled = 0; break; } for (i = 0, desc = &hdr->desc_list[0], entry=&rep->entries[0]; i < num_to_fill; i++, desc++, entry++) { /* * NOTE: we're mapping the values here directly * from the SCSI/ATA bit definitions to the bio.h * definitons. There is also a warning in * disk_zone.h, but the impact is that if * additional values are added in the SCSI/ATA * specs these will be visible to consumers of * this interface. */ entry->zone_type = desc->zone_type & SRZ_TYPE_MASK; entry->zone_condition = (desc->zone_flags & SRZ_ZONE_COND_MASK) >> SRZ_ZONE_COND_SHIFT; entry->zone_flags |= desc->zone_flags & (SRZ_ZONE_NON_SEQ|SRZ_ZONE_RESET); entry->zone_length = ata ? le64dec(desc->zone_length) : scsi_8btou64(desc->zone_length); entry->zone_start_lba = ata ? le64dec(desc->zone_start_lba) : scsi_8btou64(desc->zone_start_lba); entry->write_pointer_lba = ata ? le64dec(desc->write_pointer_lba) : scsi_8btou64(desc->write_pointer_lba); } rep->entries_filled = num_to_fill; break; } case DISK_ZONE_GET_PARAMS: default: /* * In theory we should not get a GET_PARAMS bio, since it * should be handled without queueing the command to the * drive. */ panic("%s: Invalid zone command %d", __func__, bp->bio_zone.zone_cmd); break; } if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES) free(ccb->csio.data_ptr, M_SCSIDA); } static void dadone(struct cam_periph *periph, union ccb *done_ccb) { struct da_softc *softc; struct ccb_scsiio *csio; u_int32_t priority; da_ccb_state state; softc = (struct da_softc *)periph->softc; priority = done_ccb->ccb_h.pinfo.priority; CAM_DEBUG(periph->path, CAM_DEBUG_TRACE, ("dadone\n")); csio = &done_ccb->csio; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (csio->bio != NULL) biotrack(csio->bio, __func__); #endif state = csio->ccb_h.ccb_state & DA_CCB_TYPE_MASK; switch (state) { case DA_CCB_BUFFER_IO: case DA_CCB_DELETE: { struct bio *bp, *bp1; cam_periph_lock(periph); bp = (struct bio *)done_ccb->ccb_h.ccb_bp; if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { int error; int sf; if ((csio->ccb_h.ccb_state & DA_CCB_RETRY_UA) != 0) sf = SF_RETRY_UA; else sf = 0; error = daerror(done_ccb, CAM_RETRY_SELTO, sf); if (error == ERESTART) { /* * A retry was scheduled, so * just return. */ cam_periph_unlock(periph); return; } bp = (struct bio *)done_ccb->ccb_h.ccb_bp; if (error != 0) { int queued_error; /* * return all queued I/O with EIO, so that * the client can retry these I/Os in the * proper order should it attempt to recover. */ queued_error = EIO; if (error == ENXIO && (softc->flags & DA_FLAG_PACK_INVALID)== 0) { /* * Catastrophic error. Mark our pack as * invalid. */ /* * XXX See if this is really a media * XXX change first? */ xpt_print(periph->path, "Invalidating pack\n"); softc->flags |= DA_FLAG_PACK_INVALID; #ifdef CAM_IO_STATS softc->invalidations++; #endif queued_error = ENXIO; } cam_iosched_flush(softc->cam_iosched, NULL, queued_error); if (bp != NULL) { bp->bio_error = error; bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; } } else if (bp != NULL) { if (state == DA_CCB_DELETE) bp->bio_resid = 0; else bp->bio_resid = csio->resid; bp->bio_error = 0; if (bp->bio_resid != 0) bp->bio_flags |= BIO_ERROR; } if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } else if (bp != NULL) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) panic("REQ_CMP with QFRZN"); if (bp->bio_cmd == BIO_ZONE) dazonedone(periph, done_ccb); else if (state == DA_CCB_DELETE) bp->bio_resid = 0; else bp->bio_resid = csio->resid; if ((csio->resid > 0) && (bp->bio_cmd != BIO_ZONE)) bp->bio_flags |= BIO_ERROR; if (softc->error_inject != 0) { bp->bio_error = softc->error_inject; bp->bio_resid = bp->bio_bcount; bp->bio_flags |= BIO_ERROR; softc->error_inject = 0; } } if (bp != NULL) biotrack(bp, __func__); LIST_REMOVE(&done_ccb->ccb_h, periph_links.le); if (LIST_EMPTY(&softc->pending_ccbs)) softc->flags |= DA_FLAG_WAS_OTAG; /* * We need to call cam_iosched before we call biodone so that we * don't measure any activity that happens in the completion * routine, which in the case of sendfile can be quite * extensive. */ cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb); xpt_release_ccb(done_ccb); if (state == DA_CCB_DELETE) { TAILQ_HEAD(, bio) queue; TAILQ_INIT(&queue); TAILQ_CONCAT(&queue, &softc->delete_run_queue.queue, bio_queue); softc->delete_run_queue.insert_point = NULL; /* * Normally, the xpt_release_ccb() above would make sure * that when we have more work to do, that work would * get kicked off. However, we specifically keep * delete_running set to 0 before the call above to * allow other I/O to progress when many BIO_DELETE * requests are pushed down. We set delete_running to 0 * and call daschedule again so that we don't stall if * there are no other I/Os pending apart from BIO_DELETEs. */ cam_iosched_trim_done(softc->cam_iosched); daschedule(periph); cam_periph_unlock(periph); while ((bp1 = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, bp1, bio_queue); bp1->bio_error = bp->bio_error; if (bp->bio_flags & BIO_ERROR) { bp1->bio_flags |= BIO_ERROR; bp1->bio_resid = bp1->bio_bcount; } else bp1->bio_resid = 0; biodone(bp1); } } else { daschedule(periph); cam_periph_unlock(periph); } if (bp != NULL) biodone(bp); return; } case DA_CCB_PROBE_WP: { struct scsi_mode_header_6 *mode_hdr6; struct scsi_mode_header_10 *mode_hdr10; uint8_t dev_spec; if (softc->minimum_cmd_size > 6) { mode_hdr10 = (struct scsi_mode_header_10 *)csio->data_ptr; dev_spec = mode_hdr10->dev_spec; } else { mode_hdr6 = (struct scsi_mode_header_6 *)csio->data_ptr; dev_spec = mode_hdr6->dev_spec; } if (cam_ccb_status(done_ccb) == CAM_REQ_CMP) { if ((dev_spec & 0x80) != 0) softc->disk->d_flags |= DISKFLAG_WRITE_PROTECT; else softc->disk->d_flags &= ~DISKFLAG_WRITE_PROTECT; } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); xpt_release_ccb(done_ccb); if ((softc->flags & DA_FLAG_CAN_RC16) != 0) softc->state = DA_STATE_PROBE_RC16; else softc->state = DA_STATE_PROBE_RC; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_RC: case DA_CCB_PROBE_RC16: { struct scsi_read_capacity_data *rdcap; struct scsi_read_capacity_data_long *rcaplong; char *announce_buf; int lbp; lbp = 0; rdcap = NULL; rcaplong = NULL; /* XXX TODO: can this be a malloc? */ announce_buf = softc->announce_temp; bzero(announce_buf, DA_ANNOUNCETMP_SZ); if (state == DA_CCB_PROBE_RC) rdcap =(struct scsi_read_capacity_data *)csio->data_ptr; else rcaplong = (struct scsi_read_capacity_data_long *) csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { struct disk_params *dp; uint32_t block_size; uint64_t maxsector; u_int lalba; /* Lowest aligned LBA. */ if (state == DA_CCB_PROBE_RC) { block_size = scsi_4btoul(rdcap->length); maxsector = scsi_4btoul(rdcap->addr); lalba = 0; /* * According to SBC-2, if the standard 10 * byte READ CAPACITY command returns 2^32, * we should issue the 16 byte version of * the command, since the device in question * has more sectors than can be represented * with the short version of the command. */ if (maxsector == 0xffffffff) { free(rdcap, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_RC16; xpt_schedule(periph, priority); return; } } else { block_size = scsi_4btoul(rcaplong->length); maxsector = scsi_8btou64(rcaplong->addr); lalba = scsi_2btoul(rcaplong->lalba_lbp); } /* * Because GEOM code just will panic us if we * give them an 'illegal' value we'll avoid that * here. */ if (block_size == 0) { block_size = 512; if (maxsector == 0) maxsector = -1; } if (block_size >= MAXPHYS) { xpt_print(periph->path, "unsupportable block size %ju\n", (uintmax_t) block_size); announce_buf = NULL; cam_periph_invalidate(periph); } else { /* * We pass rcaplong into dasetgeom(), * because it will only use it if it is * non-NULL. */ dasetgeom(periph, block_size, maxsector, rcaplong, sizeof(*rcaplong)); lbp = (lalba & SRC16_LBPME_A); dp = &softc->params; snprintf(announce_buf, DA_ANNOUNCETMP_SZ, "%juMB (%ju %u byte sectors)", ((uintmax_t)dp->secsize * dp->sectors) / (1024 * 1024), (uintmax_t)dp->sectors, dp->secsize); } } else { int error; /* * Retry any UNIT ATTENTION type errors. They * are expected at boot. */ error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) { /* * A retry was scheuled, so * just return. */ return; } else if (error != 0) { int asc, ascq; int sense_key, error_code; int have_sense; cam_status status; struct ccb_getdev cgd; /* Don't wedge this device's queue */ status = done_ccb->ccb_h.status; if ((status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); xpt_setup_ccb(&cgd.ccb_h, done_ccb->ccb_h.path, CAM_PRIORITY_NORMAL); cgd.ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)&cgd); if (scsi_extract_sense_ccb(done_ccb, &error_code, &sense_key, &asc, &ascq)) have_sense = TRUE; else have_sense = FALSE; /* * If we tried READ CAPACITY(16) and failed, * fallback to READ CAPACITY(10). */ if ((state == DA_CCB_PROBE_RC16) && (softc->flags & DA_FLAG_CAN_RC16) && (((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) || ((have_sense) && (error_code == SSD_CURRENT_ERROR) && (sense_key == SSD_KEY_ILLEGAL_REQUEST)))) { softc->flags &= ~DA_FLAG_CAN_RC16; free(rdcap, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_RC; xpt_schedule(periph, priority); return; } /* * Attach to anything that claims to be a * direct access or optical disk device, * as long as it doesn't return a "Logical * unit not supported" (0x25) error. * "Internal Target Failure" (0x44) is also * special and typically means that the * device is a SATA drive behind a SATL * translation that's fallen into a * terminally fatal state. */ if ((have_sense) && (asc != 0x25) && (asc != 0x44) && (error_code == SSD_CURRENT_ERROR)) { const char *sense_key_desc; const char *asc_desc; dasetgeom(periph, 512, -1, NULL, 0); scsi_sense_desc(sense_key, asc, ascq, &cgd.inq_data, &sense_key_desc, &asc_desc); snprintf(announce_buf, DA_ANNOUNCETMP_SZ, "Attempt to query device " "size failed: %s, %s", sense_key_desc, asc_desc); } else { if (have_sense) scsi_sense_print( &done_ccb->csio); else { xpt_print(periph->path, "got CAM status %#x\n", done_ccb->ccb_h.status); } xpt_print(periph->path, "fatal error, " "failed to attach to device\n"); announce_buf = NULL; /* * Free up resources. */ cam_periph_invalidate(periph); } } } free(csio->data_ptr, M_SCSIDA); if (announce_buf != NULL && ((softc->flags & DA_FLAG_ANNOUNCED) == 0)) { struct sbuf sb; sbuf_new(&sb, softc->announcebuf, DA_ANNOUNCE_SZ, SBUF_FIXEDLEN); xpt_announce_periph_sbuf(periph, &sb, announce_buf); xpt_announce_quirks_sbuf(periph, &sb, softc->quirks, DA_Q_BIT_STRING); sbuf_finish(&sb); sbuf_putbuf(&sb); /* * Create our sysctl variables, now that we know * we have successfully attached. */ /* increase the refcount */ if (da_periph_acquire(periph, DA_REF_SYSCTL) == 0) { taskqueue_enqueue(taskqueue_thread, &softc->sysctl_task); } else { /* XXX This message is useless! */ xpt_print(periph->path, "fatal error, " "could not acquire reference count\n"); } } /* We already probed the device. */ if (softc->flags & DA_FLAG_PROBED) { daprobedone(periph, done_ccb); return; } /* Ensure re-probe doesn't see old delete. */ softc->delete_available = 0; dadeleteflag(softc, DA_DELETE_ZERO, 1); if (lbp && (softc->quirks & DA_Q_NO_UNMAP) == 0) { /* * Based on older SBC-3 spec revisions * any of the UNMAP methods "may" be * available via LBP given this flag so * we flag all of them as available and * then remove those which further * probes confirm aren't available * later. * * We could also check readcap(16) p_type * flag to exclude one or more invalid * write same (X) types here */ dadeleteflag(softc, DA_DELETE_WS16, 1); dadeleteflag(softc, DA_DELETE_WS10, 1); dadeleteflag(softc, DA_DELETE_UNMAP, 1); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_LBP; xpt_schedule(periph, priority); return; } xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_BDC; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_LBP: { struct scsi_vpd_logical_block_prov *lbp; lbp = (struct scsi_vpd_logical_block_prov *)csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { /* * T10/1799-D Revision 31 states at least one of these * must be supported but we don't currently enforce this. */ dadeleteflag(softc, DA_DELETE_WS16, (lbp->flags & SVPD_LBP_WS16)); dadeleteflag(softc, DA_DELETE_WS10, (lbp->flags & SVPD_LBP_WS10)); dadeleteflag(softc, DA_DELETE_UNMAP, (lbp->flags & SVPD_LBP_UNMAP)); } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } /* * Failure indicates we don't support any SBC-3 * delete methods with UNMAP */ } } free(lbp, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_BLK_LIMITS; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_BLK_LIMITS: { struct scsi_vpd_block_limits *block_limits; block_limits = (struct scsi_vpd_block_limits *)csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t max_txfer_len = scsi_4btoul( block_limits->max_txfer_len); uint32_t max_unmap_lba_cnt = scsi_4btoul( block_limits->max_unmap_lba_cnt); uint32_t max_unmap_blk_cnt = scsi_4btoul( block_limits->max_unmap_blk_cnt); uint32_t unmap_gran = scsi_4btoul( block_limits->opt_unmap_grain); uint32_t unmap_gran_align = scsi_4btoul( block_limits->unmap_grain_align); uint64_t ws_max_blks = scsi_8btou64( block_limits->max_write_same_length); if (max_txfer_len != 0) { softc->disk->d_maxsize = MIN(softc->maxio, (off_t)max_txfer_len * softc->params.secsize); } /* * We should already support UNMAP but we check lba * and block count to be sure */ if (max_unmap_lba_cnt != 0x00L && max_unmap_blk_cnt != 0x00L) { softc->unmap_max_lba = max_unmap_lba_cnt; softc->unmap_max_ranges = min(max_unmap_blk_cnt, UNMAP_MAX_RANGES); if (unmap_gran > 1) { softc->unmap_gran = unmap_gran; if (unmap_gran_align & 0x80000000) { softc->unmap_gran_align = unmap_gran_align & 0x7fffffff; } } } else { /* * Unexpected UNMAP limits which means the * device doesn't actually support UNMAP */ dadeleteflag(softc, DA_DELETE_UNMAP, 0); } if (ws_max_blks != 0x00L) softc->ws_max_blks = ws_max_blks; } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } /* * Failure here doesn't mean UNMAP is not * supported as this is an optional page. */ softc->unmap_max_lba = 1; softc->unmap_max_ranges = 1; } } free(block_limits, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_BDC; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_BDC: { struct scsi_vpd_block_device_characteristics *bdc; bdc = (struct scsi_vpd_block_device_characteristics *) csio->data_ptr; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t valid_len; /* * Disable queue sorting for non-rotational media * by default. */ u_int16_t old_rate = softc->disk->d_rotation_rate; valid_len = csio->dxfer_len - csio->resid; if (SBDC_IS_PRESENT(bdc, valid_len, medium_rotation_rate)) { softc->disk->d_rotation_rate = scsi_2btoul(bdc->medium_rotation_rate); if (softc->disk->d_rotation_rate == SVPD_BDC_RATE_NON_ROTATING) { cam_iosched_set_sort_queue( softc->cam_iosched, 0); softc->rotating = 0; } if (softc->disk->d_rotation_rate != old_rate) { disk_attr_changed(softc->disk, "GEOM::rotation_rate", M_NOWAIT); } } if ((SBDC_IS_PRESENT(bdc, valid_len, flags)) && (softc->zone_mode == DA_ZONE_NONE)) { int ata_proto; if (scsi_vpd_supported_page(periph, SVPD_ATA_INFORMATION)) ata_proto = 1; else ata_proto = 0; /* * The Zoned field will only be set for * Drive Managed and Host Aware drives. If * they are Host Managed, the device type * in the standard INQUIRY data should be * set to T_ZBC_HM (0x14). */ if ((bdc->flags & SVPD_ZBC_MASK) == SVPD_HAW_ZBC) { softc->zone_mode = DA_ZONE_HOST_AWARE; softc->zone_interface = (ata_proto) ? DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI; } else if ((bdc->flags & SVPD_ZBC_MASK) == SVPD_DM_ZBC) { softc->zone_mode =DA_ZONE_DRIVE_MANAGED; softc->zone_interface = (ata_proto) ? DA_ZONE_IF_ATA_SAT : DA_ZONE_IF_SCSI; } else if ((bdc->flags & SVPD_ZBC_MASK) != SVPD_ZBC_NR) { xpt_print(periph->path, "Unknown zoned " "type %#x", bdc->flags & SVPD_ZBC_MASK); } } } else { int error; error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(bdc, M_SCSIDA); xpt_release_ccb(done_ccb); softc->state = DA_STATE_PROBE_ATA; xpt_schedule(periph, priority); return; } case DA_CCB_PROBE_ATA: { int i; struct ata_params *ata_params; int continue_probe; int error; int16_t *ptr; ata_params = (struct ata_params *)csio->data_ptr; ptr = (uint16_t *)ata_params; continue_probe = 0; error = 0; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint16_t old_rate; for (i = 0; i < sizeof(*ata_params) / 2; i++) ptr[i] = le16toh(ptr[i]); if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM && (softc->quirks & DA_Q_NO_UNMAP) == 0) { dadeleteflag(softc, DA_DELETE_ATA_TRIM, 1); if (ata_params->max_dsm_blocks != 0) softc->trim_max_ranges = min( softc->trim_max_ranges, ata_params->max_dsm_blocks * ATA_DSM_BLK_RANGES); } /* * Disable queue sorting for non-rotational media * by default. */ old_rate = softc->disk->d_rotation_rate; softc->disk->d_rotation_rate = ata_params->media_rotation_rate; if (softc->disk->d_rotation_rate == ATA_RATE_NON_ROTATING) { cam_iosched_set_sort_queue(softc->cam_iosched, 0); softc->rotating = 0; } if (softc->disk->d_rotation_rate != old_rate) { disk_attr_changed(softc->disk, "GEOM::rotation_rate", M_NOWAIT); } if (ata_params->capabilities1 & ATA_SUPPORT_DMA) softc->flags |= DA_FLAG_CAN_ATA_DMA; if (ata_params->support.extension & ATA_SUPPORT_GENLOG) softc->flags |= DA_FLAG_CAN_ATA_LOG; /* * At this point, if we have a SATA host aware drive, * we communicate via ATA passthrough unless the * SAT layer supports ZBC -> ZAC translation. In * that case, */ /* * XXX KDM figure out how to detect a host managed * SATA drive. */ if (softc->zone_mode == DA_ZONE_NONE) { /* * Note that we don't override the zone * mode or interface if it has already been * set. This is because it has either been * set as a quirk, or when we probed the * SCSI Block Device Characteristics page, * the zoned field was set. The latter * means that the SAT layer supports ZBC to * ZAC translation, and we would prefer to * use that if it is available. */ if ((ata_params->support3 & ATA_SUPPORT_ZONE_MASK) == ATA_SUPPORT_ZONE_HOST_AWARE) { softc->zone_mode = DA_ZONE_HOST_AWARE; softc->zone_interface = DA_ZONE_IF_ATA_PASS; } else if ((ata_params->support3 & ATA_SUPPORT_ZONE_MASK) == ATA_SUPPORT_ZONE_DEV_MANAGED) { softc->zone_mode =DA_ZONE_DRIVE_MANAGED; softc->zone_interface = DA_ZONE_IF_ATA_PASS; } } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(ata_params, M_SCSIDA); if ((softc->zone_mode == DA_ZONE_HOST_AWARE) || (softc->zone_mode == DA_ZONE_HOST_MANAGED)) { /* * If the ATA IDENTIFY failed, we could be talking * to a SCSI drive, although that seems unlikely, * since the drive did report that it supported the * ATA Information VPD page. If the ATA IDENTIFY * succeeded, and the SAT layer doesn't support * ZBC -> ZAC translation, continue on to get the * directory of ATA logs, and complete the rest of * the ZAC probe. If the SAT layer does support * ZBC -> ZAC translation, we want to use that, * and we'll probe the SCSI Zoned Block Device * Characteristics VPD page next. */ if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_LOG) && (softc->zone_interface == DA_ZONE_IF_ATA_PASS)) softc->state = DA_STATE_PROBE_ATA_LOGDIR; else softc->state = DA_STATE_PROBE_ZONE; continue_probe = 1; } if (continue_probe != 0) { xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } else daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ATA_LOGDIR: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { error = 0; softc->valid_logdir_len = 0; bzero(&softc->ata_logdir, sizeof(softc->ata_logdir)); softc->valid_logdir_len = csio->dxfer_len - csio->resid; if (softc->valid_logdir_len > 0) bcopy(csio->data_ptr, &softc->ata_logdir, min(softc->valid_logdir_len, sizeof(softc->ata_logdir))); /* * Figure out whether the Identify Device log is * supported. The General Purpose log directory * has a header, and lists the number of pages * available for each GP log identified by the * offset into the list. */ if ((softc->valid_logdir_len >= ((ATA_IDENTIFY_DATA_LOG + 1) * sizeof(uint16_t))) && (le16dec(softc->ata_logdir.header) == ATA_GP_LOG_DIR_VERSION) && (le16dec(&softc->ata_logdir.num_pages[ (ATA_IDENTIFY_DATA_LOG * sizeof(uint16_t)) - sizeof(uint16_t)]) > 0)){ softc->flags |= DA_FLAG_CAN_ATA_IDLOG; } else { softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG; } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA log directory, * then ATA logs are effectively not * supported even if the bit is set in the * identify data. */ softc->flags &= ~(DA_FLAG_CAN_ATA_LOG | DA_FLAG_CAN_ATA_IDLOG); if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_IDLOG)) { softc->state = DA_STATE_PROBE_ATA_IDDIR; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ATA_IDDIR: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { off_t entries_offset, max_entries; error = 0; softc->valid_iddir_len = 0; bzero(&softc->ata_iddir, sizeof(softc->ata_iddir)); softc->flags &= ~(DA_FLAG_CAN_ATA_SUPCAP | DA_FLAG_CAN_ATA_ZONE); softc->valid_iddir_len = csio->dxfer_len - csio->resid; if (softc->valid_iddir_len > 0) bcopy(csio->data_ptr, &softc->ata_iddir, min(softc->valid_iddir_len, sizeof(softc->ata_iddir))); entries_offset = __offsetof(struct ata_identify_log_pages,entries); max_entries = softc->valid_iddir_len - entries_offset; if ((softc->valid_iddir_len > (entries_offset + 1)) && (le64dec(softc->ata_iddir.header) == ATA_IDLOG_REVISION) && (softc->ata_iddir.entry_count > 0)) { int num_entries, i; num_entries = softc->ata_iddir.entry_count; num_entries = min(num_entries, softc->valid_iddir_len - entries_offset); for (i = 0; i < num_entries && i < max_entries; i++) { if (softc->ata_iddir.entries[i] == ATA_IDL_SUP_CAP) softc->flags |= DA_FLAG_CAN_ATA_SUPCAP; else if (softc->ata_iddir.entries[i]== ATA_IDL_ZDI) softc->flags |= DA_FLAG_CAN_ATA_ZONE; if ((softc->flags & DA_FLAG_CAN_ATA_SUPCAP) && (softc->flags & DA_FLAG_CAN_ATA_ZONE)) break; } } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA Identify Data log * directory, then it effectively isn't * supported even if the ATA Log directory * a non-zero number of pages present for * this log. */ softc->flags &= ~DA_FLAG_CAN_ATA_IDLOG; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_SUPCAP)) { softc->state = DA_STATE_PROBE_ATA_SUP; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ATA_SUP: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t valid_len; size_t needed_size; struct ata_identify_log_sup_cap *sup_cap; error = 0; sup_cap = (struct ata_identify_log_sup_cap *) csio->data_ptr; valid_len = csio->dxfer_len - csio->resid; needed_size = __offsetof(struct ata_identify_log_sup_cap, sup_zac_cap) + 1 + sizeof(sup_cap->sup_zac_cap); if (valid_len >= needed_size) { uint64_t zoned, zac_cap; zoned = le64dec(sup_cap->zoned_cap); if (zoned & ATA_ZONED_VALID) { /* * This should have already been * set, because this is also in the * ATA identify data. */ if ((zoned & ATA_ZONED_MASK) == ATA_SUPPORT_ZONE_HOST_AWARE) softc->zone_mode = DA_ZONE_HOST_AWARE; else if ((zoned & ATA_ZONED_MASK) == ATA_SUPPORT_ZONE_DEV_MANAGED) softc->zone_mode = DA_ZONE_DRIVE_MANAGED; } zac_cap = le64dec(sup_cap->sup_zac_cap); if (zac_cap & ATA_SUP_ZAC_CAP_VALID) { if (zac_cap & ATA_REPORT_ZONES_SUP) softc->zone_flags |= DA_ZONE_FLAG_RZ_SUP; if (zac_cap & ATA_ND_OPEN_ZONE_SUP) softc->zone_flags |= DA_ZONE_FLAG_OPEN_SUP; if (zac_cap & ATA_ND_CLOSE_ZONE_SUP) softc->zone_flags |= DA_ZONE_FLAG_CLOSE_SUP; if (zac_cap & ATA_ND_FINISH_ZONE_SUP) softc->zone_flags |= DA_ZONE_FLAG_FINISH_SUP; if (zac_cap & ATA_ND_RWP_SUP) softc->zone_flags |= DA_ZONE_FLAG_RWP_SUP; } else { /* * This field was introduced in * ACS-4, r08 on April 28th, 2015. * If the drive firmware was written * to an earlier spec, it won't have * the field. So, assume all * commands are supported. */ softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK; } } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { /* * If we can't get the ATA Identify Data * Supported Capabilities page, clear the * flag... */ softc->flags &= ~DA_FLAG_CAN_ATA_SUPCAP; /* * And clear zone capabilities. */ softc->zone_flags &= ~DA_ZONE_FLAG_SUP_MASK; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); if ((error == 0) && (softc->flags & DA_FLAG_CAN_ATA_ZONE)) { softc->state = DA_STATE_PROBE_ATA_ZONE; xpt_release_ccb(done_ccb); xpt_schedule(periph, priority); return; } daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ATA_ZONE: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { struct ata_zoned_info_log *zi_log; uint32_t valid_len; size_t needed_size; zi_log = (struct ata_zoned_info_log *)csio->data_ptr; valid_len = csio->dxfer_len - csio->resid; needed_size = __offsetof(struct ata_zoned_info_log, version_info) + 1 + sizeof(zi_log->version_info); if (valid_len >= needed_size) { uint64_t tmpvar; tmpvar = le64dec(zi_log->zoned_cap); if (tmpvar & ATA_ZDI_CAP_VALID) { if (tmpvar & ATA_ZDI_CAP_URSWRZ) softc->zone_flags |= DA_ZONE_FLAG_URSWRZ; else softc->zone_flags &= ~DA_ZONE_FLAG_URSWRZ; } tmpvar = le64dec(zi_log->optimal_seq_zones); if (tmpvar & ATA_ZDI_OPT_SEQ_VALID) { softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET; softc->optimal_seq_zones = (tmpvar & ATA_ZDI_OPT_SEQ_MASK); } else { softc->zone_flags &= ~DA_ZONE_FLAG_OPT_SEQ_SET; softc->optimal_seq_zones = 0; } tmpvar =le64dec(zi_log->optimal_nonseq_zones); if (tmpvar & ATA_ZDI_OPT_NS_VALID) { softc->zone_flags |= DA_ZONE_FLAG_OPT_NONSEQ_SET; softc->optimal_nonseq_zones = (tmpvar & ATA_ZDI_OPT_NS_MASK); } else { softc->zone_flags &= ~DA_ZONE_FLAG_OPT_NONSEQ_SET; softc->optimal_nonseq_zones = 0; } tmpvar = le64dec(zi_log->max_seq_req_zones); if (tmpvar & ATA_ZDI_MAX_SEQ_VALID) { softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET; softc->max_seq_zones = (tmpvar & ATA_ZDI_MAX_SEQ_MASK); } else { softc->zone_flags &= ~DA_ZONE_FLAG_MAX_SEQ_SET; softc->max_seq_zones = 0; } } } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { softc->flags &= ~DA_FLAG_CAN_ATA_ZONE; softc->flags &= ~DA_ZONE_FLAG_SET_MASK; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } free(csio->data_ptr, M_SCSIDA); daprobedone(periph, done_ccb); return; } case DA_CCB_PROBE_ZONE: { int error; if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) { uint32_t valid_len; size_t needed_len; struct scsi_vpd_zoned_bdc *zoned_bdc; error = 0; zoned_bdc = (struct scsi_vpd_zoned_bdc *) csio->data_ptr; valid_len = csio->dxfer_len - csio->resid; needed_len = __offsetof(struct scsi_vpd_zoned_bdc, max_seq_req_zones) + 1 + sizeof(zoned_bdc->max_seq_req_zones); if ((valid_len >= needed_len) && (scsi_2btoul(zoned_bdc->page_length) >= SVPD_ZBDC_PL)) { if (zoned_bdc->flags & SVPD_ZBDC_URSWRZ) softc->zone_flags |= DA_ZONE_FLAG_URSWRZ; else softc->zone_flags &= ~DA_ZONE_FLAG_URSWRZ; softc->optimal_seq_zones = scsi_4btoul(zoned_bdc->optimal_seq_zones); softc->zone_flags |= DA_ZONE_FLAG_OPT_SEQ_SET; softc->optimal_nonseq_zones = scsi_4btoul( zoned_bdc->optimal_nonseq_zones); softc->zone_flags |= DA_ZONE_FLAG_OPT_NONSEQ_SET; softc->max_seq_zones = scsi_4btoul(zoned_bdc->max_seq_req_zones); softc->zone_flags |= DA_ZONE_FLAG_MAX_SEQ_SET; } /* * All of the zone commands are mandatory for SCSI * devices. * * XXX KDM this is valid as of September 2015. * Re-check this assumption once the SAT spec is * updated to support SCSI ZBC to ATA ZAC mapping. * Since ATA allows zone commands to be reported * as supported or not, this may not necessarily * be true for an ATA device behind a SAT (SCSI to * ATA Translation) layer. */ softc->zone_flags |= DA_ZONE_FLAG_SUP_MASK; } else { error = daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA|SF_NO_PRINT); if (error == ERESTART) return; else if (error != 0) { if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) { /* Don't wedge this device's queue */ cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } } } daprobedone(periph, done_ccb); return; } case DA_CCB_DUMP: /* No-op. We're polling */ return; case DA_CCB_TUR: { if ((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { if (daerror(done_ccb, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_RECOVERY | SF_NO_PRINT) == ERESTART) return; if ((done_ccb->ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(done_ccb->ccb_h.path, /*relsim_flags*/0, /*reduction*/0, /*timeout*/0, /*getcount_only*/0); } xpt_release_ccb(done_ccb); da_periph_release_locked(periph, DA_REF_TUR); return; } default: break; } xpt_release_ccb(done_ccb); } static void dareprobe(struct cam_periph *periph) { struct da_softc *softc; int status; softc = (struct da_softc *)periph->softc; /* Probe in progress; don't interfere. */ if (softc->state != DA_STATE_NORMAL) return; status = da_periph_acquire(periph, DA_REF_REPROBE); KASSERT(status == 0, ("dareprobe: cam_periph_acquire failed")); softc->state = DA_STATE_PROBE_WP; xpt_schedule(periph, CAM_PRIORITY_DEV); } static int daerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { struct da_softc *softc; struct cam_periph *periph; int error, error_code, sense_key, asc, ascq; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (ccb->csio.bio != NULL) biotrack(ccb->csio.bio, __func__); #endif periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct da_softc *)periph->softc; /* * Automatically detect devices that do not support * READ(6)/WRITE(6) and upgrade to using 10 byte cdbs. */ error = 0; if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_INVALID) { error = cmd6workaround(ccb); } else if (scsi_extract_sense_ccb(ccb, &error_code, &sense_key, &asc, &ascq)) { if (sense_key == SSD_KEY_ILLEGAL_REQUEST) error = cmd6workaround(ccb); /* * If the target replied with CAPACITY DATA HAS CHANGED UA, * query the capacity and notify upper layers. */ else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x2A && ascq == 0x09) { xpt_print(periph->path, "Capacity data has changed\n"); softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); sense_flags |= SF_NO_PRINT; } else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x28 && ascq == 0x00) { softc->flags &= ~DA_FLAG_PROBED; disk_media_changed(softc->disk, M_NOWAIT); } else if (sense_key == SSD_KEY_UNIT_ATTENTION && asc == 0x3F && ascq == 0x03) { xpt_print(periph->path, "INQUIRY data has changed\n"); softc->flags &= ~DA_FLAG_PROBED; dareprobe(periph); sense_flags |= SF_NO_PRINT; } else if (sense_key == SSD_KEY_NOT_READY && asc == 0x3a && (softc->flags & DA_FLAG_PACK_INVALID) == 0) { softc->flags |= DA_FLAG_PACK_INVALID; disk_media_gone(softc->disk, M_NOWAIT); } } if (error == ERESTART) return (ERESTART); #ifdef CAM_IO_STATS switch (ccb->ccb_h.status & CAM_STATUS_MASK) { case CAM_CMD_TIMEOUT: softc->timeouts++; break; case CAM_REQ_ABORTED: case CAM_REQ_CMP_ERR: case CAM_REQ_TERMIO: case CAM_UNREC_HBA_ERROR: case CAM_DATA_RUN_ERR: softc->errors++; break; default: break; } #endif /* * XXX * Until we have a better way of doing pack validation, * don't treat UAs as errors. */ sense_flags |= SF_RETRY_UA; if (softc->quirks & DA_Q_RETRY_BUSY) sense_flags |= SF_RETRY_BUSY; return(cam_periph_error(ccb, cam_flags, sense_flags)); } static void damediapoll(void *arg) { struct cam_periph *periph = arg; struct da_softc *softc = periph->softc; if (!cam_iosched_has_work_flags(softc->cam_iosched, DA_WORK_TUR) && LIST_EMPTY(&softc->pending_ccbs)) { if (da_periph_acquire(periph, DA_REF_TUR) == 0) { cam_iosched_set_work_flags(softc->cam_iosched, DA_WORK_TUR); daschedule(periph); } } /* Queue us up again */ if (da_poll_period != 0) callout_schedule(&softc->mediapoll_c, da_poll_period * hz); } static void daprevent(struct cam_periph *periph, int action) { struct da_softc *softc; union ccb *ccb; int error; softc = (struct da_softc *)periph->softc; if (((action == PR_ALLOW) && (softc->flags & DA_FLAG_PACK_LOCKED) == 0) || ((action == PR_PREVENT) && (softc->flags & DA_FLAG_PACK_LOCKED) != 0)) { return; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_prevent(&ccb->csio, /*retries*/1, /*cbcfp*/dadone, MSG_SIMPLE_Q_TAG, action, SSD_FULL_SIZE, 5000); error = cam_periph_runccb(ccb, daerror, CAM_RETRY_SELTO, SF_RETRY_UA | SF_NO_PRINT, softc->disk->d_devstat); if (error == 0) { if (action == PR_ALLOW) softc->flags &= ~DA_FLAG_PACK_LOCKED; else softc->flags |= DA_FLAG_PACK_LOCKED; } xpt_release_ccb(ccb); } static void dasetgeom(struct cam_periph *periph, uint32_t block_len, uint64_t maxsector, struct scsi_read_capacity_data_long *rcaplong, size_t rcap_len) { struct ccb_calc_geometry ccg; struct da_softc *softc; struct disk_params *dp; u_int lbppbe, lalba; int error; softc = (struct da_softc *)periph->softc; dp = &softc->params; dp->secsize = block_len; dp->sectors = maxsector + 1; if (rcaplong != NULL) { lbppbe = rcaplong->prot_lbppbe & SRC16_LBPPBE; lalba = scsi_2btoul(rcaplong->lalba_lbp); lalba &= SRC16_LALBA_A; } else { lbppbe = 0; lalba = 0; } if (lbppbe > 0) { dp->stripesize = block_len << lbppbe; dp->stripeoffset = (dp->stripesize - block_len * lalba) % dp->stripesize; } else if (softc->quirks & DA_Q_4K) { dp->stripesize = 4096; dp->stripeoffset = 0; } else if (softc->unmap_gran != 0) { dp->stripesize = block_len * softc->unmap_gran; dp->stripeoffset = (dp->stripesize - block_len * softc->unmap_gran_align) % dp->stripesize; } else { dp->stripesize = 0; dp->stripeoffset = 0; } /* * Have the controller provide us with a geometry * for this disk. The only time the geometry * matters is when we boot and the controller * is the only one knowledgeable enough to come * up with something that will make this a bootable * device. */ xpt_setup_ccb(&ccg.ccb_h, periph->path, CAM_PRIORITY_NORMAL); ccg.ccb_h.func_code = XPT_CALC_GEOMETRY; ccg.block_size = dp->secsize; ccg.volume_size = dp->sectors; ccg.heads = 0; ccg.secs_per_track = 0; ccg.cylinders = 0; xpt_action((union ccb*)&ccg); if ((ccg.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { /* * We don't know what went wrong here- but just pick * a geometry so we don't have nasty things like divide * by zero. */ dp->heads = 255; dp->secs_per_track = 255; dp->cylinders = dp->sectors / (255 * 255); if (dp->cylinders == 0) { dp->cylinders = 1; } } else { dp->heads = ccg.heads; dp->secs_per_track = ccg.secs_per_track; dp->cylinders = ccg.cylinders; } /* * If the user supplied a read capacity buffer, and if it is * different than the previous buffer, update the data in the EDT. * If it's the same, we don't bother. This avoids sending an * update every time someone opens this device. */ if ((rcaplong != NULL) && (bcmp(rcaplong, &softc->rcaplong, min(sizeof(softc->rcaplong), rcap_len)) != 0)) { struct ccb_dev_advinfo cdai; xpt_setup_ccb(&cdai.ccb_h, periph->path, CAM_PRIORITY_NORMAL); cdai.ccb_h.func_code = XPT_DEV_ADVINFO; cdai.buftype = CDAI_TYPE_RCAPLONG; cdai.flags = CDAI_FLAG_STORE; cdai.bufsiz = rcap_len; cdai.buf = (uint8_t *)rcaplong; xpt_action((union ccb *)&cdai); if ((cdai.ccb_h.status & CAM_DEV_QFRZN) != 0) cam_release_devq(cdai.ccb_h.path, 0, 0, 0, FALSE); if (cdai.ccb_h.status != CAM_REQ_CMP) { xpt_print(periph->path, "%s: failed to set read " "capacity advinfo\n", __func__); /* Use cam_error_print() to decode the status */ cam_error_print((union ccb *)&cdai, CAM_ESF_CAM_STATUS, CAM_EPF_ALL); } else { bcopy(rcaplong, &softc->rcaplong, min(sizeof(softc->rcaplong), rcap_len)); } } softc->disk->d_sectorsize = softc->params.secsize; softc->disk->d_mediasize = softc->params.secsize * (off_t)softc->params.sectors; softc->disk->d_stripesize = softc->params.stripesize; softc->disk->d_stripeoffset = softc->params.stripeoffset; /* XXX: these are not actually "firmware" values, so they may be wrong */ softc->disk->d_fwsectors = softc->params.secs_per_track; softc->disk->d_fwheads = softc->params.heads; softc->disk->d_devstat->block_size = softc->params.secsize; softc->disk->d_devstat->flags &= ~DEVSTAT_BS_UNAVAILABLE; error = disk_resize(softc->disk, M_NOWAIT); if (error != 0) xpt_print(periph->path, "disk_resize(9) failed, error = %d\n", error); } static void dasendorderedtag(void *arg) { struct da_softc *softc = arg; if (da_send_ordered) { if (!LIST_EMPTY(&softc->pending_ccbs)) { if ((softc->flags & DA_FLAG_WAS_OTAG) == 0) softc->flags |= DA_FLAG_NEED_OTAG; softc->flags &= ~DA_FLAG_WAS_OTAG; } } /* Queue us up again */ callout_reset(&softc->sendordered_c, (da_default_timeout * hz) / DA_ORDEREDTAG_INTERVAL, dasendorderedtag, softc); } /* * Step through all DA peripheral drivers, and if the device is still open, * sync the disk cache to physical media. */ static void dashutdown(void * arg, int howto) { struct cam_periph *periph; struct da_softc *softc; union ccb *ccb; int error; CAM_PERIPH_FOREACH(periph, &dadriver) { softc = (struct da_softc *)periph->softc; if (SCHEDULER_STOPPED()) { /* If we paniced with the lock held, do not recurse. */ if (!cam_periph_owned(periph) && (softc->flags & DA_FLAG_OPEN)) { dadump(softc->disk, NULL, 0, 0, 0); } continue; } cam_periph_lock(periph); /* * We only sync the cache if the drive is still open, and * if the drive is capable of it.. */ if (((softc->flags & DA_FLAG_OPEN) == 0) || (softc->quirks & DA_Q_NO_SYNC_CACHE)) { cam_periph_unlock(periph); continue; } ccb = cam_periph_getccb(periph, CAM_PRIORITY_NORMAL); scsi_synchronize_cache(&ccb->csio, /*retries*/0, /*cbfcnp*/dadone, MSG_SIMPLE_Q_TAG, /*begin_lba*/0, /* whole disk */ /*lb_count*/0, SSD_FULL_SIZE, 60 * 60 * 1000); error = cam_periph_runccb(ccb, daerror, /*cam_flags*/0, /*sense_flags*/ SF_NO_RECOVERY | SF_NO_RETRY | SF_QUIET_IR, softc->disk->d_devstat); if (error != 0) xpt_print(periph->path, "Synchronize cache failed\n"); xpt_release_ccb(ccb); cam_periph_unlock(periph); } } #else /* !_KERNEL */ /* * XXX These are only left out of the kernel build to silence warnings. If, * for some reason these functions are used in the kernel, the ifdefs should * be moved so they are included both in the kernel and userland. */ void scsi_format_unit(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int16_t ileave, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_format_unit *scsi_cmd; scsi_cmd = (struct scsi_format_unit *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = FORMAT_UNIT; scsi_cmd->byte2 = byte2; scsi_ulto2b(ileave, scsi_cmd->interleave); cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_read_defects(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t list_format, uint32_t addr_desc_index, uint8_t *data_ptr, uint32_t dxfer_len, int minimum_cmd_size, uint8_t sense_len, uint32_t timeout) { uint8_t cdb_len; /* * These conditions allow using the 10 byte command. Otherwise we * need to use the 12 byte command. */ if ((minimum_cmd_size <= 10) && (addr_desc_index == 0) && (dxfer_len <= SRDD10_MAX_LENGTH)) { struct scsi_read_defect_data_10 *cdb10; cdb10 = (struct scsi_read_defect_data_10 *) &csio->cdb_io.cdb_bytes; cdb_len = sizeof(*cdb10); bzero(cdb10, cdb_len); cdb10->opcode = READ_DEFECT_DATA_10; cdb10->format = list_format; scsi_ulto2b(dxfer_len, cdb10->alloc_length); } else { struct scsi_read_defect_data_12 *cdb12; cdb12 = (struct scsi_read_defect_data_12 *) &csio->cdb_io.cdb_bytes; cdb_len = sizeof(*cdb12); bzero(cdb12, cdb_len); cdb12->opcode = READ_DEFECT_DATA_12; cdb12->format = list_format; scsi_ulto4b(dxfer_len, cdb12->alloc_length); scsi_ulto4b(addr_desc_index, cdb12->address_descriptor_index); } cam_fill_csio(csio, retries, cbfcnp, /*flags*/ CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, cdb_len, timeout); } void scsi_sanitize(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int16_t control, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_sanitize *scsi_cmd; scsi_cmd = (struct scsi_sanitize *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = SANITIZE; scsi_cmd->byte2 = byte2; scsi_cmd->control = control; scsi_ulto2b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } #endif /* _KERNEL */ void scsi_zbc_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t service_action, uint64_t zone_id, uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t sense_len, uint32_t timeout) { struct scsi_zbc_out *scsi_cmd; scsi_cmd = (struct scsi_zbc_out *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = ZBC_OUT; scsi_cmd->service_action = service_action; scsi_u64to8b(zone_id, scsi_cmd->zone_id); scsi_cmd->zone_flags = zone_flags; cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_zbc_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t service_action, uint64_t zone_start_lba, uint8_t zone_options, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t sense_len, uint32_t timeout) { struct scsi_zbc_in *scsi_cmd; scsi_cmd = (struct scsi_zbc_in *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = ZBC_IN; scsi_cmd->service_action = service_action; scsi_ulto4b(dxfer_len, scsi_cmd->length); scsi_u64to8b(zone_start_lba, scsi_cmd->zone_start_lba); scsi_cmd->zone_options = zone_options; cam_fill_csio(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_IN : CAM_DIR_NONE, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } int scsi_ata_zac_mgmt_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int use_ncq, uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t *cdb_storage, size_t cdb_storage_len, uint8_t sense_len, uint32_t timeout) { uint8_t command_out, protocol, ata_flags; uint16_t features_out; uint32_t sectors_out, auxiliary; int retval; retval = 0; if (use_ncq == 0) { command_out = ATA_ZAC_MANAGEMENT_OUT; features_out = (zm_action & 0xf) | (zone_flags << 8); ata_flags = AP_FLAG_BYT_BLOK_BLOCKS; if (dxfer_len == 0) { protocol = AP_PROTO_NON_DATA; ata_flags |= AP_FLAG_TLEN_NO_DATA; sectors_out = 0; } else { protocol = AP_PROTO_DMA; ata_flags |= AP_FLAG_TLEN_SECT_CNT | AP_FLAG_TDIR_TO_DEV; sectors_out = ((dxfer_len >> 9) & 0xffff); } auxiliary = 0; } else { ata_flags = AP_FLAG_BYT_BLOK_BLOCKS; if (dxfer_len == 0) { command_out = ATA_NCQ_NON_DATA; features_out = ATA_NCQ_ZAC_MGMT_OUT; /* * We're assuming the SCSI to ATA translation layer * will set the NCQ tag number in the tag field. * That isn't clear from the SAT-4 spec (as of rev 05). */ sectors_out = 0; ata_flags |= AP_FLAG_TLEN_NO_DATA; } else { command_out = ATA_SEND_FPDMA_QUEUED; /* * Note that we're defaulting to normal priority, * and assuming that the SCSI to ATA translation * layer will insert the NCQ tag number in the tag * field. That isn't clear in the SAT-4 spec (as * of rev 05). */ sectors_out = ATA_SFPDMA_ZAC_MGMT_OUT << 8; ata_flags |= AP_FLAG_TLEN_FEAT | AP_FLAG_TDIR_TO_DEV; /* * For SEND FPDMA QUEUED, the transfer length is * encoded in the FEATURE register, and 0 means * that 65536 512 byte blocks are to be tranferred. * In practice, it seems unlikely that we'll see * a transfer that large, and it may confuse the * the SAT layer, because generally that means that * 0 bytes should be transferred. */ if (dxfer_len == (65536 * 512)) { features_out = 0; } else if (dxfer_len <= (65535 * 512)) { features_out = ((dxfer_len >> 9) & 0xffff); } else { /* The transfer is too big. */ retval = 1; goto bailout; } } auxiliary = (zm_action & 0xf) | (zone_flags << 8); protocol = AP_PROTO_FPDMA; } protocol |= AP_EXTEND; retval = scsi_ata_pass(csio, retries, cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, /*protocol*/ protocol, /*ata_flags*/ ata_flags, /*features*/ features_out, /*sector_count*/ sectors_out, /*lba*/ zone_id, /*command*/ command_out, /*device*/ 0, /*icc*/ 0, /*auxiliary*/ auxiliary, /*control*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, /*cdb_storage*/ cdb_storage, /*cdb_storage_len*/ cdb_storage_len, /*minimum_cmd_size*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout); bailout: return (retval); } int scsi_ata_zac_mgmt_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int use_ncq, uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t *cdb_storage, size_t cdb_storage_len, uint8_t sense_len, uint32_t timeout) { uint8_t command_out, protocol; uint16_t features_out, sectors_out; uint32_t auxiliary; int ata_flags; int retval; retval = 0; ata_flags = AP_FLAG_TDIR_FROM_DEV | AP_FLAG_BYT_BLOK_BLOCKS; if (use_ncq == 0) { command_out = ATA_ZAC_MANAGEMENT_IN; /* XXX KDM put a macro here */ features_out = (zm_action & 0xf) | (zone_flags << 8); sectors_out = dxfer_len >> 9; /* XXX KDM macro */ protocol = AP_PROTO_DMA; ata_flags |= AP_FLAG_TLEN_SECT_CNT; auxiliary = 0; } else { ata_flags |= AP_FLAG_TLEN_FEAT; command_out = ATA_RECV_FPDMA_QUEUED; sectors_out = ATA_RFPDMA_ZAC_MGMT_IN << 8; /* * For RECEIVE FPDMA QUEUED, the transfer length is * encoded in the FEATURE register, and 0 means * that 65536 512 byte blocks are to be tranferred. * In practice, it seems unlikely that we'll see * a transfer that large, and it may confuse the * the SAT layer, because generally that means that * 0 bytes should be transferred. */ if (dxfer_len == (65536 * 512)) { features_out = 0; } else if (dxfer_len <= (65535 * 512)) { features_out = ((dxfer_len >> 9) & 0xffff); } else { /* The transfer is too big. */ retval = 1; goto bailout; } auxiliary = (zm_action & 0xf) | (zone_flags << 8), protocol = AP_PROTO_FPDMA; } protocol |= AP_EXTEND; retval = scsi_ata_pass(csio, retries, cbfcnp, /*flags*/ CAM_DIR_IN, tag_action, /*protocol*/ protocol, /*ata_flags*/ ata_flags, /*features*/ features_out, /*sector_count*/ sectors_out, /*lba*/ zone_id, /*command*/ command_out, /*device*/ 0, /*icc*/ 0, /*auxiliary*/ auxiliary, /*control*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ (dxfer_len >> 9) * 512, /* XXX KDM */ /*cdb_storage*/ cdb_storage, /*cdb_storage_len*/ cdb_storage_len, /*minimum_cmd_size*/ 0, /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ timeout); bailout: return (retval); } Index: head/sys/dev/nvme/nvme.h =================================================================== --- head/sys/dev/nvme/nvme.h (revision 329818) +++ head/sys/dev/nvme/nvme.h (revision 329819) @@ -1,1136 +1,1133 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __NVME_H__ #define __NVME_H__ #ifdef _KERNEL #include #endif #include #define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command) #define NVME_RESET_CONTROLLER _IO('n', 1) #define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test) #define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test) /* * Macros to deal with NVME revisions, as defined VS register */ #define NVME_REV(x, y) (((x) << 16) | ((y) << 8)) #define NVME_MAJOR(r) (((r) >> 16) & 0xffff) #define NVME_MINOR(r) (((r) >> 8) & 0xff) /* * Use to mark a command to apply to all namespaces, or to retrieve global * log pages. */ #define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF) /* Cap nvme to 1MB transfers driver explodes with larger sizes */ #define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? MAXPHYS : (1<<20)) -/* Largest DSM Trim that can be done */ -#define NVME_MAX_DSM_TRIM 4096 - union cap_lo_register { uint32_t raw; struct { /** maximum queue entries supported */ uint32_t mqes : 16; /** contiguous queues required */ uint32_t cqr : 1; /** arbitration mechanism supported */ uint32_t ams : 2; uint32_t reserved1 : 5; /** timeout */ uint32_t to : 8; } bits __packed; } __packed; _Static_assert(sizeof(union cap_lo_register) == 4, "bad size for cap_lo_register"); union cap_hi_register { uint32_t raw; struct { /** doorbell stride */ uint32_t dstrd : 4; uint32_t reserved3 : 1; /** command sets supported */ uint32_t css_nvm : 1; uint32_t css_reserved : 3; uint32_t reserved2 : 7; /** memory page size minimum */ uint32_t mpsmin : 4; /** memory page size maximum */ uint32_t mpsmax : 4; uint32_t reserved1 : 8; } bits __packed; } __packed; _Static_assert(sizeof(union cap_hi_register) == 4, "bad size of cap_hi_register"); union cc_register { uint32_t raw; struct { /** enable */ uint32_t en : 1; uint32_t reserved1 : 3; /** i/o command set selected */ uint32_t css : 3; /** memory page size */ uint32_t mps : 4; /** arbitration mechanism selected */ uint32_t ams : 3; /** shutdown notification */ uint32_t shn : 2; /** i/o submission queue entry size */ uint32_t iosqes : 4; /** i/o completion queue entry size */ uint32_t iocqes : 4; uint32_t reserved2 : 8; } bits __packed; } __packed; _Static_assert(sizeof(union cc_register) == 4, "bad size for cc_register"); enum shn_value { NVME_SHN_NORMAL = 0x1, NVME_SHN_ABRUPT = 0x2, }; union csts_register { uint32_t raw; struct { /** ready */ uint32_t rdy : 1; /** controller fatal status */ uint32_t cfs : 1; /** shutdown status */ uint32_t shst : 2; uint32_t reserved1 : 28; } bits __packed; } __packed; _Static_assert(sizeof(union csts_register) == 4, "bad size for csts_register"); enum shst_value { NVME_SHST_NORMAL = 0x0, NVME_SHST_OCCURRING = 0x1, NVME_SHST_COMPLETE = 0x2, }; union aqa_register { uint32_t raw; struct { /** admin submission queue size */ uint32_t asqs : 12; uint32_t reserved1 : 4; /** admin completion queue size */ uint32_t acqs : 12; uint32_t reserved2 : 4; } bits __packed; } __packed; _Static_assert(sizeof(union aqa_register) == 4, "bad size for aqa_resgister"); struct nvme_registers { /** controller capabilities */ union cap_lo_register cap_lo; union cap_hi_register cap_hi; uint32_t vs; /* version */ uint32_t intms; /* interrupt mask set */ uint32_t intmc; /* interrupt mask clear */ /** controller configuration */ union cc_register cc; uint32_t reserved1; /** controller status */ union csts_register csts; uint32_t reserved2; /** admin queue attributes */ union aqa_register aqa; uint64_t asq; /* admin submission queue base addr */ uint64_t acq; /* admin completion queue base addr */ uint32_t reserved3[0x3f2]; struct { uint32_t sq_tdbl; /* submission queue tail doorbell */ uint32_t cq_hdbl; /* completion queue head doorbell */ } doorbell[1] __packed; } __packed; _Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers"); struct nvme_command { /* dword 0 */ uint16_t opc : 8; /* opcode */ uint16_t fuse : 2; /* fused operation */ uint16_t rsvd1 : 6; uint16_t cid; /* command identifier */ /* dword 1 */ uint32_t nsid; /* namespace identifier */ /* dword 2-3 */ uint32_t rsvd2; uint32_t rsvd3; /* dword 4-5 */ uint64_t mptr; /* metadata pointer */ /* dword 6-7 */ uint64_t prp1; /* prp entry 1 */ /* dword 8-9 */ uint64_t prp2; /* prp entry 2 */ /* dword 10-15 */ uint32_t cdw10; /* command-specific */ uint32_t cdw11; /* command-specific */ uint32_t cdw12; /* command-specific */ uint32_t cdw13; /* command-specific */ uint32_t cdw14; /* command-specific */ uint32_t cdw15; /* command-specific */ } __packed; _Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command"); struct nvme_status { uint16_t p : 1; /* phase tag */ uint16_t sc : 8; /* status code */ uint16_t sct : 3; /* status code type */ uint16_t rsvd2 : 2; uint16_t m : 1; /* more */ uint16_t dnr : 1; /* do not retry */ } __packed; _Static_assert(sizeof(struct nvme_status) == 2, "bad size for nvme_status"); struct nvme_completion { /* dword 0 */ uint32_t cdw0; /* command-specific */ /* dword 1 */ uint32_t rsvd1; /* dword 2 */ uint16_t sqhd; /* submission queue head pointer */ uint16_t sqid; /* submission queue identifier */ /* dword 3 */ uint16_t cid; /* command identifier */ struct nvme_status status; } __packed; _Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion"); struct nvme_dsm_range { uint32_t attributes; uint32_t length; uint64_t starting_lba; } __packed; _Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_ranage"); /* status code types */ enum nvme_status_code_type { NVME_SCT_GENERIC = 0x0, NVME_SCT_COMMAND_SPECIFIC = 0x1, NVME_SCT_MEDIA_ERROR = 0x2, /* 0x3-0x6 - reserved */ NVME_SCT_VENDOR_SPECIFIC = 0x7, }; /* generic command status codes */ enum nvme_generic_command_status_code { NVME_SC_SUCCESS = 0x00, NVME_SC_INVALID_OPCODE = 0x01, NVME_SC_INVALID_FIELD = 0x02, NVME_SC_COMMAND_ID_CONFLICT = 0x03, NVME_SC_DATA_TRANSFER_ERROR = 0x04, NVME_SC_ABORTED_POWER_LOSS = 0x05, NVME_SC_INTERNAL_DEVICE_ERROR = 0x06, NVME_SC_ABORTED_BY_REQUEST = 0x07, NVME_SC_ABORTED_SQ_DELETION = 0x08, NVME_SC_ABORTED_FAILED_FUSED = 0x09, NVME_SC_ABORTED_MISSING_FUSED = 0x0a, NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b, NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c, NVME_SC_LBA_OUT_OF_RANGE = 0x80, NVME_SC_CAPACITY_EXCEEDED = 0x81, NVME_SC_NAMESPACE_NOT_READY = 0x82, }; /* command specific status codes */ enum nvme_command_specific_status_code { NVME_SC_COMPLETION_QUEUE_INVALID = 0x00, NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01, NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02, NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03, /* 0x04 - reserved */ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05, NVME_SC_INVALID_FIRMWARE_SLOT = 0x06, NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07, NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08, NVME_SC_INVALID_LOG_PAGE = 0x09, NVME_SC_INVALID_FORMAT = 0x0a, NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b, NVME_SC_CONFLICTING_ATTRIBUTES = 0x80, NVME_SC_INVALID_PROTECTION_INFO = 0x81, NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82, }; /* media error status codes */ enum nvme_media_error_status_code { NVME_SC_WRITE_FAULTS = 0x80, NVME_SC_UNRECOVERED_READ_ERROR = 0x81, NVME_SC_GUARD_CHECK_ERROR = 0x82, NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83, NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84, NVME_SC_COMPARE_FAILURE = 0x85, NVME_SC_ACCESS_DENIED = 0x86, }; /* admin opcodes */ enum nvme_admin_opcode { NVME_OPC_DELETE_IO_SQ = 0x00, NVME_OPC_CREATE_IO_SQ = 0x01, NVME_OPC_GET_LOG_PAGE = 0x02, /* 0x03 - reserved */ NVME_OPC_DELETE_IO_CQ = 0x04, NVME_OPC_CREATE_IO_CQ = 0x05, NVME_OPC_IDENTIFY = 0x06, /* 0x07 - reserved */ NVME_OPC_ABORT = 0x08, NVME_OPC_SET_FEATURES = 0x09, NVME_OPC_GET_FEATURES = 0x0a, /* 0x0b - reserved */ NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c, NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d, /* 0x0e-0x0f - reserved */ NVME_OPC_FIRMWARE_ACTIVATE = 0x10, NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11, NVME_OPC_NAMESPACE_ATTACHMENT = 0x15, NVME_OPC_FORMAT_NVM = 0x80, NVME_OPC_SECURITY_SEND = 0x81, NVME_OPC_SECURITY_RECEIVE = 0x82, }; /* nvme nvm opcodes */ enum nvme_nvm_opcode { NVME_OPC_FLUSH = 0x00, NVME_OPC_WRITE = 0x01, NVME_OPC_READ = 0x02, /* 0x03 - reserved */ NVME_OPC_WRITE_UNCORRECTABLE = 0x04, NVME_OPC_COMPARE = 0x05, /* 0x06-0x07 - reserved */ NVME_OPC_DATASET_MANAGEMENT = 0x09, }; enum nvme_feature { /* 0x00 - reserved */ NVME_FEAT_ARBITRATION = 0x01, NVME_FEAT_POWER_MANAGEMENT = 0x02, NVME_FEAT_LBA_RANGE_TYPE = 0x03, NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04, NVME_FEAT_ERROR_RECOVERY = 0x05, NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06, NVME_FEAT_NUMBER_OF_QUEUES = 0x07, NVME_FEAT_INTERRUPT_COALESCING = 0x08, NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09, NVME_FEAT_WRITE_ATOMICITY = 0x0A, NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B, NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C, NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D, NVME_FEAT_TIMESTAMP = 0x0E, NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F, NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10, NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11, /* 0x12-0x77 - reserved */ /* 0x78-0x7f - NVMe Management Interface */ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80, /* 0x81-0xBF - command set specific (reserved) */ /* 0xC0-0xFF - vendor specific */ }; enum nvme_dsm_attribute { NVME_DSM_ATTR_INTEGRAL_READ = 0x1, NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2, NVME_DSM_ATTR_DEALLOCATE = 0x4, }; enum nvme_activate_action { NVME_AA_REPLACE_NO_ACTIVATE = 0x0, NVME_AA_REPLACE_ACTIVATE = 0x1, NVME_AA_ACTIVATE = 0x2, }; struct nvme_power_state { /** Maximum Power */ uint16_t mp; /* Maximum Power */ uint8_t ps_rsvd1; uint8_t mps : 1; /* Max Power Scale */ uint8_t nops : 1; /* Non-Operational State */ uint8_t ps_rsvd2 : 6; uint32_t enlat; /* Entry Latency */ uint32_t exlat; /* Exit Latency */ uint8_t rrt : 5; /* Relative Read Throughput */ uint8_t ps_rsvd3 : 3; uint8_t rrl : 5; /* Relative Read Latency */ uint8_t ps_rsvd4 : 3; uint8_t rwt : 5; /* Relative Write Throughput */ uint8_t ps_rsvd5 : 3; uint8_t rwl : 5; /* Relative Write Latency */ uint8_t ps_rsvd6 : 3; uint16_t idlp; /* Idle Power */ uint8_t ps_rsvd7 : 6; uint8_t ips : 2; /* Idle Power Scale */ uint8_t ps_rsvd8; uint16_t actp; /* Active Power */ uint8_t apw : 3; /* Active Power Workload */ uint8_t ps_rsvd9 : 3; uint8_t aps : 2; /* Active Power Scale */ uint8_t ps_rsvd10[9]; } __packed; _Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state"); #define NVME_SERIAL_NUMBER_LENGTH 20 #define NVME_MODEL_NUMBER_LENGTH 40 #define NVME_FIRMWARE_REVISION_LENGTH 8 struct nvme_controller_data { /* bytes 0-255: controller capabilities and features */ /** pci vendor id */ uint16_t vid; /** pci subsystem vendor id */ uint16_t ssvid; /** serial number */ uint8_t sn[NVME_SERIAL_NUMBER_LENGTH]; /** model number */ uint8_t mn[NVME_MODEL_NUMBER_LENGTH]; /** firmware revision */ uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH]; /** recommended arbitration burst */ uint8_t rab; /** ieee oui identifier */ uint8_t ieee[3]; /** multi-interface capabilities */ uint8_t mic; /** maximum data transfer size */ uint8_t mdts; /** Controller ID */ uint16_t ctrlr_id; /** Version */ uint32_t ver; /** RTD3 Resume Latency */ uint32_t rtd3r; /** RTD3 Enter Latency */ uint32_t rtd3e; /** Optional Asynchronous Events Supported */ uint32_t oaes; /* bitfield really */ /** Controller Attributes */ uint32_t ctratt; /* bitfield really */ uint8_t reserved1[12]; /** FRU Globally Unique Identifier */ uint8_t fguid[16]; uint8_t reserved2[128]; /* bytes 256-511: admin command set attributes */ /** optional admin command support */ struct { /* supports security send/receive commands */ uint16_t security : 1; /* supports format nvm command */ uint16_t format : 1; /* supports firmware activate/download commands */ uint16_t firmware : 1; /* supports namespace management commands */ uint16_t nsmgmt : 1; uint16_t oacs_rsvd : 12; } __packed oacs; /** abort command limit */ uint8_t acl; /** asynchronous event request limit */ uint8_t aerl; /** firmware updates */ struct { /* first slot is read-only */ uint8_t slot1_ro : 1; /* number of firmware slots */ uint8_t num_slots : 3; uint8_t frmw_rsvd : 4; } __packed frmw; /** log page attributes */ struct { /* per namespace smart/health log page */ uint8_t ns_smart : 1; uint8_t lpa_rsvd : 7; } __packed lpa; /** error log page entries */ uint8_t elpe; /** number of power states supported */ uint8_t npss; /** admin vendor specific command configuration */ struct { /* admin vendor specific commands use spec format */ uint8_t spec_format : 1; uint8_t avscc_rsvd : 7; } __packed avscc; /** Autonomous Power State Transition Attributes */ struct { /* Autonmous Power State Transitions supported */ uint8_t apst_supp : 1; uint8_t apsta_rsvd : 7; } __packed apsta; /** Warning Composite Temperature Threshold */ uint16_t wctemp; /** Critical Composite Temperature Threshold */ uint16_t cctemp; /** Maximum Time for Firmware Activation */ uint16_t mtfa; /** Host Memory Buffer Preferred Size */ uint32_t hmpre; /** Host Memory Buffer Minimum Size */ uint32_t hmmin; /** Name space capabilities */ struct { /* if nsmgmt, report tnvmcap and unvmcap */ uint8_t tnvmcap[16]; uint8_t unvmcap[16]; } __packed untncap; /** Replay Protected Memory Block Support */ uint32_t rpmbs; /* Really a bitfield */ /** Extended Device Self-test Time */ uint16_t edstt; /** Device Self-test Options */ uint8_t dsto; /* Really a bitfield */ /** Firmware Update Granularity */ uint8_t fwug; /** Keep Alive Support */ uint16_t kas; /** Host Controlled Thermal Management Attributes */ uint16_t hctma; /* Really a bitfield */ /** Minimum Thermal Management Temperature */ uint16_t mntmt; /** Maximum Thermal Management Temperature */ uint16_t mxtmt; /** Sanitize Capabilities */ uint32_t sanicap; /* Really a bitfield */ uint8_t reserved3[180]; /* bytes 512-703: nvm command set attributes */ /** submission queue entry size */ struct { uint8_t min : 4; uint8_t max : 4; } __packed sqes; /** completion queue entry size */ struct { uint8_t min : 4; uint8_t max : 4; } __packed cqes; /** Maximum Outstanding Commands */ uint16_t maxcmd; /** number of namespaces */ uint32_t nn; /** optional nvm command support */ struct { uint16_t compare : 1; uint16_t write_unc : 1; uint16_t dsm: 1; uint16_t reserved: 13; } __packed oncs; /** fused operation support */ uint16_t fuses; /** format nvm attributes */ uint8_t fna; /** volatile write cache */ struct { uint8_t present : 1; uint8_t reserved : 7; } __packed vwc; /* TODO: flesh out remaining nvm command set attributes */ uint8_t reserved5[178]; /* bytes 704-2047: i/o command set attributes */ uint8_t reserved6[1344]; /* bytes 2048-3071: power state descriptors */ struct nvme_power_state power_state[32]; /* bytes 3072-4095: vendor specific */ uint8_t vs[1024]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data"); struct nvme_namespace_data { /** namespace size */ uint64_t nsze; /** namespace capacity */ uint64_t ncap; /** namespace utilization */ uint64_t nuse; /** namespace features */ struct { /** thin provisioning */ uint8_t thin_prov : 1; uint8_t reserved1 : 7; } __packed nsfeat; /** number of lba formats */ uint8_t nlbaf; /** formatted lba size */ struct { uint8_t format : 4; uint8_t extended : 1; uint8_t reserved2 : 3; } __packed flbas; /** metadata capabilities */ struct { /* metadata can be transferred as part of data prp list */ uint8_t extended : 1; /* metadata can be transferred with separate metadata pointer */ uint8_t pointer : 1; uint8_t reserved3 : 6; } __packed mc; /** end-to-end data protection capabilities */ struct { /* protection information type 1 */ uint8_t pit1 : 1; /* protection information type 2 */ uint8_t pit2 : 1; /* protection information type 3 */ uint8_t pit3 : 1; /* first eight bytes of metadata */ uint8_t md_start : 1; /* last eight bytes of metadata */ uint8_t md_end : 1; } __packed dpc; /** end-to-end data protection type settings */ struct { /* protection information type */ uint8_t pit : 3; /* 1 == protection info transferred at start of metadata */ /* 0 == protection info transferred at end of metadata */ uint8_t md_start : 1; uint8_t reserved4 : 4; } __packed dps; uint8_t reserved5[98]; /** lba format support */ struct { /** metadata size */ uint32_t ms : 16; /** lba data size */ uint32_t lbads : 8; /** relative performance */ uint32_t rp : 2; uint32_t reserved6 : 6; } __packed lbaf[16]; uint8_t reserved6[192]; uint8_t vendor_specific[3712]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namepsace_data"); enum nvme_log_page { /* 0x00 - reserved */ NVME_LOG_ERROR = 0x01, NVME_LOG_HEALTH_INFORMATION = 0x02, NVME_LOG_FIRMWARE_SLOT = 0x03, NVME_LOG_CHANGED_NAMESPACE = 0x04, NVME_LOG_COMMAND_EFFECT = 0x05, /* 0x06-0x7F - reserved */ /* 0x80-0xBF - I/O command set specific */ NVME_LOG_RES_NOTIFICATION = 0x80, /* 0xC0-0xFF - vendor specific */ /* * The following are Intel Specific log pages, but they seem * to be widely implemented. */ INTEL_LOG_READ_LAT_LOG = 0xc1, INTEL_LOG_WRITE_LAT_LOG = 0xc2, INTEL_LOG_TEMP_STATS = 0xc5, INTEL_LOG_ADD_SMART = 0xca, INTEL_LOG_DRIVE_MKT_NAME = 0xdd, /* * HGST log page, with lots ofs sub pages. */ HGST_INFO_LOG = 0xc1, }; struct nvme_error_information_entry { uint64_t error_count; uint16_t sqid; uint16_t cid; struct nvme_status status; uint16_t error_location; uint64_t lba; uint32_t nsid; uint8_t vendor_specific; uint8_t reserved[35]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry"); union nvme_critical_warning_state { uint8_t raw; struct { uint8_t available_spare : 1; uint8_t temperature : 1; uint8_t device_reliability : 1; uint8_t read_only : 1; uint8_t volatile_memory_backup : 1; uint8_t reserved : 3; } __packed bits; } __packed; _Static_assert(sizeof(union nvme_critical_warning_state) == 1, "bad size for nvme_critical_warning_state"); struct nvme_health_information_page { union nvme_critical_warning_state critical_warning; uint16_t temperature; uint8_t available_spare; uint8_t available_spare_threshold; uint8_t percentage_used; uint8_t reserved[26]; /* * Note that the following are 128-bit values, but are * defined as an array of 2 64-bit values. */ /* Data Units Read is always in 512-byte units. */ uint64_t data_units_read[2]; /* Data Units Written is always in 512-byte units. */ uint64_t data_units_written[2]; /* For NVM command set, this includes Compare commands. */ uint64_t host_read_commands[2]; uint64_t host_write_commands[2]; /* Controller Busy Time is reported in minutes. */ uint64_t controller_busy_time[2]; uint64_t power_cycles[2]; uint64_t power_on_hours[2]; uint64_t unsafe_shutdowns[2]; uint64_t media_errors[2]; uint64_t num_error_info_log_entries[2]; uint32_t warning_temp_time; uint32_t error_temp_time; uint16_t temp_sensor[8]; uint8_t reserved2[296]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page"); struct nvme_firmware_page { struct { uint8_t slot : 3; /* slot for current FW */ uint8_t reserved : 5; } __packed afi; uint8_t reserved[7]; uint64_t revision[7]; /* revisions for 7 slots */ uint8_t reserved2[448]; } __packed __aligned(4); _Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page"); struct intel_log_temp_stats { uint64_t current; uint64_t overtemp_flag_last; uint64_t overtemp_flag_life; uint64_t max_temp; uint64_t min_temp; uint64_t _rsvd[5]; uint64_t max_oper_temp; uint64_t min_oper_temp; uint64_t est_offset; } __packed __aligned(4); _Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats"); #define NVME_TEST_MAX_THREADS 128 struct nvme_io_test { enum nvme_nvm_opcode opc; uint32_t size; uint32_t time; /* in seconds */ uint32_t num_threads; uint32_t flags; uint64_t io_completed[NVME_TEST_MAX_THREADS]; }; enum nvme_io_test_flags { /* * Specifies whether dev_refthread/dev_relthread should be * called during NVME_BIO_TEST. Ignored for other test * types. */ NVME_TEST_FLAG_REFTHREAD = 0x1, }; struct nvme_pt_command { /* * cmd is used to specify a passthrough command to a controller or * namespace. * * The following fields from cmd may be specified by the caller: * * opc (opcode) * * nsid (namespace id) - for admin commands only * * cdw10-cdw15 * * Remaining fields must be set to 0 by the caller. */ struct nvme_command cmd; /* * cpl returns completion status for the passthrough command * specified by cmd. * * The following fields will be filled out by the driver, for * consumption by the caller: * * cdw0 * * status (except for phase) * * Remaining fields will be set to 0 by the driver. */ struct nvme_completion cpl; /* buf is the data buffer associated with this passthrough command. */ void * buf; /* * len is the length of the data buffer associated with this * passthrough command. */ uint32_t len; /* * is_read = 1 if the passthrough command will read data into the * supplied buffer from the controller. * * is_read = 0 if the passthrough command will write data from the * supplied buffer to the controller. */ uint32_t is_read; /* * driver_lock is used by the driver only. It must be set to 0 * by the caller. */ struct mtx * driver_lock; }; #define nvme_completion_is_error(cpl) \ ((cpl)->status.sc != 0 || (cpl)->status.sct != 0) void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); #ifdef _KERNEL struct bio; struct nvme_namespace; struct nvme_controller; struct nvme_consumer; typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *); typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *); typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *, uint32_t, void *, uint32_t); typedef void (*nvme_cons_fail_fn_t)(void *); enum nvme_namespace_flags { NVME_NS_DEALLOCATE_SUPPORTED = 0x1, NVME_NS_FLUSH_SUPPORTED = 0x2, }; int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, int is_admin_cmd); /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, uint8_t log_page, uint32_t nsid, void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg); /* NVM I/O functions */ int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, uint8_t num_ranges, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, void *cb_arg); int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, size_t len); /* Registration functions */ struct nvme_consumer * nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn, nvme_cons_async_fn_t async_fn, nvme_cons_fail_fn_t fail_fn); void nvme_unregister_consumer(struct nvme_consumer *consumer); /* Controller helper functions */ device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr); const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr); /* Namespace helper functions */ uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns); uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns); uint64_t nvme_ns_get_size(struct nvme_namespace *ns); uint32_t nvme_ns_get_flags(struct nvme_namespace *ns); const char * nvme_ns_get_serial_number(struct nvme_namespace *ns); const char * nvme_ns_get_model_number(struct nvme_namespace *ns); const struct nvme_namespace_data * nvme_ns_get_data(struct nvme_namespace *ns); uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns); int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn); /* * Command building helper functions -- shared with CAM * These functions assume allocator zeros out cmd structure * CAM's xpt_get_ccb and the request allocator for nvme both * do zero'd allocations. */ static inline void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid) { cmd->opc = NVME_OPC_FLUSH; cmd->nsid = nsid; } static inline void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid, uint64_t lba, uint32_t count) { cmd->opc = rwcmd; cmd->nsid = nsid; cmd->cdw10 = lba & 0xffffffffu; cmd->cdw11 = lba >> 32; cmd->cdw12 = count-1; } static inline void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba, uint32_t count) { nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count); } static inline void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba, uint32_t count) { nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count); } static inline void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid, uint32_t num_ranges) { cmd->opc = NVME_OPC_DATASET_MANAGEMENT; cmd->nsid = nsid; cmd->cdw10 = num_ranges - 1; cmd->cdw11 = NVME_DSM_ATTR_DEALLOCATE; } extern int nvme_use_nvd; #endif /* _KERNEL */ #endif /* __NVME_H__ */